In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

COLS_TO_DROP = ['Index', 'First Name', 'Last Name', 'Birthday', 'Defense Against the Dark Arts', 'Arithmancy', 'Care of Magical Creatures', 'Birthday Month', 'Birthday Year', 'Birthday Weekday']
NUMERICAL_COLS = ['Astronomy', 'Herbology', 'Divination', 'Muggle Studies', 'Ancient Runes', 'Charms', 'Potions', 'Transfiguration', 'History of Magic', 'Flying']
CATEGORICAL_COLS = ['Best Hand']
LEARNING_RATE = 0.02
EPOCHS = 1000

In [2]:
data = pd.read_csv("datasets/dataset_train.csv")
data['Birthday'] = pd.to_datetime(data['Birthday'])
data['Birthday Weekday'] = data['Birthday'].dt.dayofweek
data['Birthday Year'] = data['Birthday'].dt.year
data['Birthday Month'] = data['Birthday'].dt.month
data = data.drop(columns=COLS_TO_DROP)
data

Unnamed: 0,Hogwarts House,Best Hand,Astronomy,Herbology,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Charms,Flying
0,Ravenclaw,Left,-487.886086,5.727180,4.722,272.035831,532.484226,5.231058,1039.788281,3.790369,-232.79405,-26.89
1,Slytherin,Right,-552.060507,-5.987446,-5.612,-487.340557,367.760303,4.107170,1058.944592,7.248742,-252.18425,-113.45
2,Ravenclaw,Left,-366.076117,7.725017,6.140,664.893521,602.585284,3.555579,1088.088348,8.728531,-227.34265,30.42
3,Gryffindor,Left,697.742809,-6.497214,4.026,-537.001128,523.982133,-4.809637,920.391449,0.821911,-256.84675,200.64
4,Gryffindor,Left,436.775204,-7.820623,2.236,-444.262537,599.324514,-3.444377,937.434724,4.311066,-256.38730,157.98
...,...,...,...,...,...,...,...,...,...,...,...,...
1595,Gryffindor,Right,354.280086,-4.541837,5.702,-497.235066,618.220213,-5.231721,964.219853,3.389086,-250.39401,185.83
1596,Slytherin,Left,367.531174,6.061064,1.757,-643.271092,445.827565,2.238112,1056.147366,5.825263,-246.42719,44.80
1597,Gryffindor,Right,544.018925,-3.203269,6.065,-385.150457,635.211486,-5.984257,953.866685,1.709808,-251.63679,198.47
1598,Hufflepuff,Left,453.676219,3.442831,6.738,-831.741123,383.444937,3.813111,1087.949205,3.904100,-246.19072,-76.81


In [3]:
def ft_train_test_split(data, test_size=0.25, stratify_col=None, random_state=None):
    if not stratify_col:
        data_train = data.sample(frac=1-test_size, random_state=random_state)
    else:
        groups = data.groupby(stratify_col).groups
        data_train = pd.DataFrame()
        for group in groups.values():
            group_sample = data.iloc[group].sample(frac=1-test_size, random_state=random_state)
            data_train = pd.concat([data_train, group_sample])
    data_test = data.iloc[data.index.difference(data_train.index)]
    return (data_train, data_test)

In [4]:
data_train, data_test = ft_train_test_split(data, stratify_col='Hogwarts House')
display(data_train)
display(data_test)

Unnamed: 0,Hogwarts House,Best Hand,Astronomy,Herbology,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Charms,Flying
466,Gryffindor,Right,593.752848,-3.570549,6.015,-841.078296,587.566839,-6.826930,940.371543,1.852756,-252.99190,254.65
338,Gryffindor,Right,638.798582,-4.180734,7.630,-118.668554,636.777899,-2.603520,972.078371,7.213116,-247.10985,191.41
1330,Gryffindor,Left,610.482083,-4.617785,2.952,-870.835815,613.463772,-5.792643,952.103735,4.454273,-256.97819,218.63
1525,Gryffindor,Right,504.049457,4.865257,5.229,-432.652377,376.568392,3.096341,1050.592593,2.104184,-245.33296,-9.98
841,Gryffindor,Left,650.971389,-5.029783,3.974,-581.590246,593.560368,,953.470823,3.841006,-255.63398,174.50
...,...,...,...,...,...,...,...,...,...,...,...,...
60,Slytherin,Right,-461.409339,-5.887283,-6.869,-334.182727,395.512967,3.507092,1057.480476,9.408526,-252.45353,-68.80
1034,Slytherin,Left,-561.917999,-1.297966,-3.957,-595.268755,409.238764,3.492155,1046.302109,8.059229,-246.27332,-7.77
1461,Slytherin,Right,-586.956158,-2.754092,,-493.585466,422.615976,4.101186,1049.331979,9.807208,-245.54608,-12.45
199,Slytherin,Right,-604.253520,-3.346542,-4.936,-415.425616,430.235291,4.523609,1047.577330,11.389266,-245.75565,-2.64


Unnamed: 0,Hogwarts House,Best Hand,Astronomy,Herbology,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Charms,Flying
0,Ravenclaw,Left,-487.886086,5.727180,4.722,272.035831,532.484226,5.231058,1039.788281,3.790369,-232.79405,-26.89
2,Ravenclaw,Left,-366.076117,7.725017,6.140,664.893521,602.585284,3.555579,1088.088348,8.728531,-227.34265,30.42
3,Gryffindor,Left,697.742809,-6.497214,4.026,-537.001128,523.982133,-4.809637,920.391449,0.821911,-256.84675,200.64
7,Hufflepuff,Right,411.412727,5.931832,2.769,-502.021336,439.351416,,1041.091935,6.581791,-244.03492,72.25
9,Hufflepuff,Right,527.193585,7.922205,3.356,-398.101991,341.475606,4.978614,1041.414665,2.068824,-244.57527,-0.09
...,...,...,...,...,...,...,...,...,...,...,...,...
1588,Slytherin,Right,-492.510311,-2.208650,-7.270,-622.936567,,2.201186,1069.012391,9.117247,-250.91192,-19.50
1590,Ravenclaw,Right,-569.513380,6.915328,3.958,559.775978,543.431180,2.882075,1065.526524,5.038939,-229.84860,17.86
1592,Gryffindor,Right,376.920722,-2.949527,7.311,-416.672294,624.056215,-6.638366,954.799304,-0.105304,-249.35589,193.78
1593,Ravenclaw,Left,-426.175401,5.681107,6.205,473.879478,647.238809,6.254227,1046.815627,7.206156,-230.80139,-29.82


In [5]:
X_train = data_train.drop(columns=['Hogwarts House'])
data_train['House Slytherin'] = (data_train['Hogwarts House'] == 'Slytherin').astype(int)
data_train['House Hufflepuff'] = (data_train['Hogwarts House'] == 'Hufflepuff').astype(int)
data_train['House Gryffindor'] = (data_train['Hogwarts House'] == 'Gryffindor').astype(int)
data_train['House Ravenclaw'] = (data_train['Hogwarts House'] == 'Ravenclaw').astype(int)
Y_train = data_train[[col for col in data_train.columns if col.startswith('House ')]]
display(X_train)
display(Y_train)

Unnamed: 0,Best Hand,Astronomy,Herbology,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Charms,Flying
466,Right,593.752848,-3.570549,6.015,-841.078296,587.566839,-6.826930,940.371543,1.852756,-252.99190,254.65
338,Right,638.798582,-4.180734,7.630,-118.668554,636.777899,-2.603520,972.078371,7.213116,-247.10985,191.41
1330,Left,610.482083,-4.617785,2.952,-870.835815,613.463772,-5.792643,952.103735,4.454273,-256.97819,218.63
1525,Right,504.049457,4.865257,5.229,-432.652377,376.568392,3.096341,1050.592593,2.104184,-245.33296,-9.98
841,Left,650.971389,-5.029783,3.974,-581.590246,593.560368,,953.470823,3.841006,-255.63398,174.50
...,...,...,...,...,...,...,...,...,...,...,...
60,Right,-461.409339,-5.887283,-6.869,-334.182727,395.512967,3.507092,1057.480476,9.408526,-252.45353,-68.80
1034,Left,-561.917999,-1.297966,-3.957,-595.268755,409.238764,3.492155,1046.302109,8.059229,-246.27332,-7.77
1461,Right,-586.956158,-2.754092,,-493.585466,422.615976,4.101186,1049.331979,9.807208,-245.54608,-12.45
199,Right,-604.253520,-3.346542,-4.936,-415.425616,430.235291,4.523609,1047.577330,11.389266,-245.75565,-2.64


Unnamed: 0,House Slytherin,House Hufflepuff,House Gryffindor,House Ravenclaw
466,0,0,1,0
338,0,0,1,0
1330,0,0,1,0
1525,0,0,1,0
841,0,0,1,0
...,...,...,...,...
60,1,0,0,0
1034,1,0,0,0
1461,1,0,0,0
199,1,0,0,0


In [6]:
X_test = data_test.drop(columns=['Hogwarts House'])
data_test['House Slytherin'] = (data_test['Hogwarts House'] == 'Slytherin').astype(int)
data_test['House Hufflepuff'] = (data_test['Hogwarts House'] == 'Hufflepuff').astype(int)
data_test['House Gryffindor'] = (data_test['Hogwarts House'] == 'Gryffindor').astype(int)
data_test['House Ravenclaw'] = (data_test['Hogwarts House'] == 'Ravenclaw').astype(int)
Y_test = data_test[[col for col in data_test.columns if col.startswith('House ')]]
display(X_test)
display(Y_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_test['House Slytherin'] = (data_test['Hogwarts House'] == 'Slytherin').astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_test['House Hufflepuff'] = (data_test['Hogwarts House'] == 'Hufflepuff').astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_test['House Gryffindor

Unnamed: 0,Best Hand,Astronomy,Herbology,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Charms,Flying
0,Left,-487.886086,5.727180,4.722,272.035831,532.484226,5.231058,1039.788281,3.790369,-232.79405,-26.89
2,Left,-366.076117,7.725017,6.140,664.893521,602.585284,3.555579,1088.088348,8.728531,-227.34265,30.42
3,Left,697.742809,-6.497214,4.026,-537.001128,523.982133,-4.809637,920.391449,0.821911,-256.84675,200.64
7,Right,411.412727,5.931832,2.769,-502.021336,439.351416,,1041.091935,6.581791,-244.03492,72.25
9,Right,527.193585,7.922205,3.356,-398.101991,341.475606,4.978614,1041.414665,2.068824,-244.57527,-0.09
...,...,...,...,...,...,...,...,...,...,...,...
1588,Right,-492.510311,-2.208650,-7.270,-622.936567,,2.201186,1069.012391,9.117247,-250.91192,-19.50
1590,Right,-569.513380,6.915328,3.958,559.775978,543.431180,2.882075,1065.526524,5.038939,-229.84860,17.86
1592,Right,376.920722,-2.949527,7.311,-416.672294,624.056215,-6.638366,954.799304,-0.105304,-249.35589,193.78
1593,Left,-426.175401,5.681107,6.205,473.879478,647.238809,6.254227,1046.815627,7.206156,-230.80139,-29.82


Unnamed: 0,House Slytherin,House Hufflepuff,House Gryffindor,House Ravenclaw
0,0,0,0,1
2,0,0,0,1
3,0,0,1,0
7,0,1,0,0
9,0,1,0,0
...,...,...,...,...
1588,1,0,0,0
1590,0,0,0,1
1592,0,0,1,0
1593,0,0,0,1


# Missing values

In [7]:
X = pd.concat([X_train, X_test])
Y = pd.concat([Y_train, Y_test])

In [8]:
X.isna().sum() / len(X) * 100

Best Hand           0.0000
Astronomy           2.0000
Herbology           2.0625
Divination          2.4375
Muggle Studies      2.1875
Ancient Runes       2.1875
History of Magic    2.6875
Transfiguration     2.1250
Potions             1.8750
Charms              0.0000
Flying              0.0000
dtype: float64

In [9]:
Y.isna().sum() / len(Y) * 100

House Slytherin     0.0
House Hufflepuff    0.0
House Gryffindor    0.0
House Ravenclaw     0.0
dtype: float64

In [10]:
(Y['House Slytherin'] + Y['House Gryffindor'] + Y['House Ravenclaw'] + Y['House Hufflepuff']).value_counts()

1    1600
Name: count, dtype: int64

# Preprocessings

In [11]:
from logreg_train import SimpleImputer, StandardScaler, OneHotEncoder, PreprocessorPipeline

imputer = SimpleImputer(NUMERICAL_COLS, CATEGORICAL_COLS)
scaler = StandardScaler(NUMERICAL_COLS)
ohe = OneHotEncoder(CATEGORICAL_COLS)
preprocessor = PreprocessorPipeline([imputer, scaler, ohe])
preprocessor

--- SimpleImputer ---
Means: None
Modes: None

--- StandardScaler ---
Means: None
Standard Deviations: None

--- OneHotEncoder ---
Columns mapping: {}
Drop Last: True


In [12]:
preprocessor.fit(X_train)
preprocessor

--- SimpleImputer ---
Means: {'Astronomy': 44.0177774297825, 'Herbology': 1.1026250198738663, 'Divination': 3.1838502994011977, 'Muggle Studies': -219.331500824042, 'Ancient Runes': 495.73133056537216, 'Charms': -243.40472560833334, 'Potions': 5.924427807832688, 'Transfiguration': 1030.1816975753563, 'History of Magic': 2.9714009082453448, 'Flying': 21.309700000000003}
Modes: {'Best Hand': 'Right'}

--- StandardScaler ---
Means: {'Astronomy': 44.0177774297825, 'Herbology': 1.1026250198738663, 'Divination': 3.1838502994011977, 'Muggle Studies': -219.331500824042, 'Ancient Runes': 495.73133056537216, 'Charms': -243.40472560833334, 'Potions': 5.924427807832688, 'Transfiguration': 1030.1816975753563, 'History of Magic': 2.9714009082453448, 'Flying': 21.309700000000003}
Standard Deviations: {'Astronomy': 520.1082337501274, 'Herbology': 5.237459825145416, 'Divination': 4.176232515841511, 'Muggle Studies': 486.40762235933954, 'Ancient Runes': 106.5017533896027, 'Charms': 8.844699713642763, 'P

In [13]:
X_train_preprocessed = preprocessor.transform(X_train)
X_train_preprocessed

Unnamed: 0,Astronomy,Herbology,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Charms,Flying,Best Hand_Left
466,1.056963,-0.892260,0.677920,-1.278242,0.862291,-2.213049,-2.018118,-1.275665,-1.083946,2.380882,0
338,1.143571,-1.008764,1.064632,0.206952,1.324359,-1.259150,-1.305636,0.403749,-0.418909,1.735614,0
1330,1.089128,-1.092211,-0.055517,-1.339420,1.105451,-1.979445,-1.754485,-0.460603,-1.534644,2.013353,1
1525,0.884492,0.718408,0.489712,-0.438564,-1.118882,0.028219,0.458652,-1.196892,-0.218010,-0.319264,0
841,1.166976,-1.170874,0.189202,-0.744764,0.918567,0.000000,-1.723765,-0.652741,-1.382665,1.563074,1
...,...,...,...,...,...,...,...,...,...,...,...
60,-0.971773,-1.334599,-2.407158,-0.236121,-0.941002,0.120991,0.613429,1.091577,-1.023076,-0.919432,0
1034,-1.165019,-0.458350,-1.709879,-0.772885,-0.812123,0.117617,0.362241,0.668839,-0.324329,-0.296714,1
1461,-1.213159,-0.736372,0.000000,-0.563836,-0.686518,0.255173,0.430325,1.216485,-0.242106,-0.344466,0
199,-1.246416,-0.849489,-1.944300,-0.403148,-0.614976,0.350581,0.390896,1.712148,-0.265800,-0.244370,0


In [14]:
from logreg_train import SortingHat

sorting_hat = SortingHat(X_train_preprocessed.shape[1], lr=LEARNING_RATE)
sorting_hat

<logreg_train.SortingHat at 0x7f0e6a349360>

In [15]:
X_test_preprocessed = preprocessor.transform(X_test)
X_test_preprocessed

Unnamed: 0,Astronomy,Herbology,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Charms,Flying,Best Hand_Left
0,-1.022679,0.882977,0.368310,1.010197,0.345092,0.510366,0.215869,-0.668606,1.199665,-0.491805,1
2,-0.788478,1.264428,0.707851,1.817868,1.003307,0.131942,1.301216,0.878533,1.816011,0.092957,1
3,1.256902,-1.451054,0.201653,-0.653093,0.265261,-1.757423,-2.467089,-1.598632,-1.519783,1.829793,1
7,0.706382,0.922051,-0.099336,-0.581179,-0.529380,0.000000,0.245163,0.205954,-0.071251,0.519768,0
9,0.928991,1.302078,0.041221,-0.367532,-1.448387,0.453349,0.252415,-1.207970,-0.132344,-0.218351,0
...,...,...,...,...,...,...,...,...,...,...,...
1588,-1.031570,-0.632229,-2.503177,-0.829767,0.000000,-0.173961,0.872562,1.000318,-0.848779,-0.416401,0
1590,-1.179622,1.109833,0.185370,1.601758,0.447879,-0.020175,0.794231,-0.277426,1.532684,-0.035199,0
1592,0.640065,-0.773687,0.988247,-0.405711,1.204909,-2.170460,-1.693913,-1.889130,-0.672851,1.759797,0
1593,-0.904029,0.874180,0.723415,1.425165,1.422582,0.741458,0.373780,0.401569,1.424959,-0.521701,1


In [16]:
for i in range(EPOCHS):
    sorting_hat.train_step(X_train_preprocessed, Y_train, X_test_preprocessed, Y_test)

In [17]:
data_train['pred'] = sorting_hat.predict(X_train_preprocessed)
houses = {0 : 'Slytherin', 1 : 'Hufflepuff', 2 : 'Gryffindor', 3 : 'Ravenclaw'}
data_train = data_train.replace({'pred':houses})
display(data_train)
data_train['true']=(data_train['Hogwarts House'] == data_train['pred'])
data_train['true'].sum() / len(data_train)

Unnamed: 0,Hogwarts House,Best Hand,Astronomy,Herbology,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Charms,Flying,House Slytherin,House Hufflepuff,House Gryffindor,House Ravenclaw,pred
466,Gryffindor,Right,593.752848,-3.570549,6.015,-841.078296,587.566839,-6.826930,940.371543,1.852756,-252.99190,254.65,0,0,1,0,Gryffindor
338,Gryffindor,Right,638.798582,-4.180734,7.630,-118.668554,636.777899,-2.603520,972.078371,7.213116,-247.10985,191.41,0,0,1,0,Gryffindor
1330,Gryffindor,Left,610.482083,-4.617785,2.952,-870.835815,613.463772,-5.792643,952.103735,4.454273,-256.97819,218.63,0,0,1,0,Gryffindor
1525,Gryffindor,Right,504.049457,4.865257,5.229,-432.652377,376.568392,3.096341,1050.592593,2.104184,-245.33296,-9.98,0,0,1,0,Hufflepuff
841,Gryffindor,Left,650.971389,-5.029783,3.974,-581.590246,593.560368,,953.470823,3.841006,-255.63398,174.50,0,0,1,0,Gryffindor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,Slytherin,Right,-461.409339,-5.887283,-6.869,-334.182727,395.512967,3.507092,1057.480476,9.408526,-252.45353,-68.80,1,0,0,0,Slytherin
1034,Slytherin,Left,-561.917999,-1.297966,-3.957,-595.268755,409.238764,3.492155,1046.302109,8.059229,-246.27332,-7.77,1,0,0,0,Slytherin
1461,Slytherin,Right,-586.956158,-2.754092,,-493.585466,422.615976,4.101186,1049.331979,9.807208,-245.54608,-12.45,1,0,0,0,Slytherin
199,Slytherin,Right,-604.253520,-3.346542,-4.936,-415.425616,430.235291,4.523609,1047.577330,11.389266,-245.75565,-2.64,1,0,0,0,Slytherin


0.9825

In [18]:
data_test = data.iloc[X_test.index]

In [19]:
data_test['pred'] = sorting_hat.predict(X_test_preprocessed)
houses = {0 : 'Slytherin', 1 : 'Hufflepuff', 2 : 'Gryffindor', 3 : 'Ravenclaw'}
data_test = data_test.replace({'pred':houses})
data_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_test['pred'] = sorting_hat.predict(X_test_preprocessed)


Unnamed: 0,Hogwarts House,Best Hand,Astronomy,Herbology,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Charms,Flying,pred
0,Ravenclaw,Left,-487.886086,5.727180,4.722,272.035831,532.484226,5.231058,1039.788281,3.790369,-232.79405,-26.89,Ravenclaw
2,Ravenclaw,Left,-366.076117,7.725017,6.140,664.893521,602.585284,3.555579,1088.088348,8.728531,-227.34265,30.42,Ravenclaw
3,Gryffindor,Left,697.742809,-6.497214,4.026,-537.001128,523.982133,-4.809637,920.391449,0.821911,-256.84675,200.64,Gryffindor
7,Hufflepuff,Right,411.412727,5.931832,2.769,-502.021336,439.351416,,1041.091935,6.581791,-244.03492,72.25,Hufflepuff
9,Hufflepuff,Right,527.193585,7.922205,3.356,-398.101991,341.475606,4.978614,1041.414665,2.068824,-244.57527,-0.09,Hufflepuff
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1588,Slytherin,Right,-492.510311,-2.208650,-7.270,-622.936567,,2.201186,1069.012391,9.117247,-250.91192,-19.50,Slytherin
1590,Ravenclaw,Right,-569.513380,6.915328,3.958,559.775978,543.431180,2.882075,1065.526524,5.038939,-229.84860,17.86,Ravenclaw
1592,Gryffindor,Right,376.920722,-2.949527,7.311,-416.672294,624.056215,-6.638366,954.799304,-0.105304,-249.35589,193.78,Gryffindor
1593,Ravenclaw,Left,-426.175401,5.681107,6.205,473.879478,647.238809,6.254227,1046.815627,7.206156,-230.80139,-29.82,Ravenclaw


In [20]:
losses = pd.DataFrame(sorting_hat.losses)
fig = go.Figure()
fig.add_trace(go.Line(x = losses['step'], y = losses['train_loss']))
fig.add_trace(go.Line(x = losses['step'], y = losses['test_loss']))
fig.show()


plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.




In [21]:
losses = pd.DataFrame(sorting_hat.losses)
fig = go.Figure()
fig.add_trace(go.Line(x = losses['step'][-100:], y = losses['train_loss'][-100:]))
fig.add_trace(go.Line(x = losses['step'][-100:], y = losses['test_loss'][-100:]))
fig.show()

In [22]:
data_train['Hogwarts House'].value_counts() / len(data_train)

Hogwarts House
Hufflepuff    0.330833
Ravenclaw     0.276667
Gryffindor    0.204167
Slytherin     0.188333
Name: count, dtype: float64

In [23]:
data_test['Hogwarts House'].value_counts() / len(data_test)

Hogwarts House
Hufflepuff    0.3300
Ravenclaw     0.2775
Gryffindor    0.2050
Slytherin     0.1875
Name: count, dtype: float64

In [24]:
data['Hogwarts House'].value_counts() / len(data)

Hogwarts House
Hufflepuff    0.330625
Ravenclaw     0.276875
Gryffindor    0.204375
Slytherin     0.188125
Name: count, dtype: float64

In [25]:
feature_importance_s = {X.columns[i]: sorting_hat.logreg_s.weights[i] for i in range(X.shape[1])}
feature_importance_h = {X.columns[i]: sorting_hat.logreg_h.weights[i] for i in range(X.shape[1])}
feature_importance_g = {X.columns[i]: sorting_hat.logreg_g.weights[i] for i in range(X.shape[1])}
feature_importance_r = {X.columns[i]: sorting_hat.logreg_r.weights[i] for i in range(X.shape[1])}
feature_importance = pd.DataFrame([feature_importance_s, feature_importance_h, feature_importance_g, feature_importance_r],
                                  index=['House Slytherin', 'House Hufflepuff', 'House Gryffindor', 'House Ravenclaw']).T

feature_importance


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`



Unnamed: 0,House Slytherin,House Hufflepuff,House Gryffindor,House Ravenclaw
Best Hand,-0.765056,1.393227,0.365009,-0.833845
Astronomy,-0.75675,1.106799,-0.614063,0.454899
Herbology,-1.221435,0.739829,0.253233,0.294358
Divination,-0.268591,-0.842493,-0.16624,1.116506
Muggle Studies,-0.429298,-1.238581,0.533232,0.907545
Ancient Runes,0.152799,0.602224,-0.749521,0.145197
History of Magic,0.222531,0.555567,-0.813195,0.162322
Transfiguration,0.655496,-0.494572,-0.341443,0.161667
Potions,-0.44253,-0.191575,-0.444165,1.056449
Charms,-0.495227,-0.337075,0.791038,-0.033514


In [26]:
data_test['true']=(data_test['Hogwarts House'] == data_test['pred'])
print(data_train['true'].sum() / len(data_train))
print(data_test['true'].sum() / len(data_test))

0.9825
0.98


In [27]:
px.bar(feature_importance, labels={
                     "index": "Feature",
                     "value": "Importance"
                 })

In [28]:
data = pd.read_csv("datasets/dataset_train.csv")
data['Birthday'] = pd.to_datetime(data['Birthday'])
data['Birthday Weekday'] = data['Birthday'].dt.dayofweek
data['Birthday Year'] = data['Birthday'].dt.year
data['Birthday Month'] = data['Birthday'].dt.month
data = data.drop(columns=COLS_TO_DROP)
data

Unnamed: 0,Hogwarts House,Best Hand,Astronomy,Herbology,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Charms,Flying
0,Ravenclaw,Left,-487.886086,5.727180,4.722,272.035831,532.484226,5.231058,1039.788281,3.790369,-232.79405,-26.89
1,Slytherin,Right,-552.060507,-5.987446,-5.612,-487.340557,367.760303,4.107170,1058.944592,7.248742,-252.18425,-113.45
2,Ravenclaw,Left,-366.076117,7.725017,6.140,664.893521,602.585284,3.555579,1088.088348,8.728531,-227.34265,30.42
3,Gryffindor,Left,697.742809,-6.497214,4.026,-537.001128,523.982133,-4.809637,920.391449,0.821911,-256.84675,200.64
4,Gryffindor,Left,436.775204,-7.820623,2.236,-444.262537,599.324514,-3.444377,937.434724,4.311066,-256.38730,157.98
...,...,...,...,...,...,...,...,...,...,...,...,...
1595,Gryffindor,Right,354.280086,-4.541837,5.702,-497.235066,618.220213,-5.231721,964.219853,3.389086,-250.39401,185.83
1596,Slytherin,Left,367.531174,6.061064,1.757,-643.271092,445.827565,2.238112,1056.147366,5.825263,-246.42719,44.80
1597,Gryffindor,Right,544.018925,-3.203269,6.065,-385.150457,635.211486,-5.984257,953.866685,1.709808,-251.63679,198.47
1598,Hufflepuff,Left,453.676219,3.442831,6.738,-831.741123,383.444937,3.813111,1087.949205,3.904100,-246.19072,-76.81


In [29]:
X = data.drop(columns=['Hogwarts House'])
data['House Slytherin'] = (data['Hogwarts House'] == 'Slytherin').astype(int)
data['House Hufflepuff'] = (data['Hogwarts House'] == 'Hufflepuff').astype(int)
data['House Gryffindor'] = (data['Hogwarts House'] == 'Gryffindor').astype(int)
data['House Ravenclaw'] = (data['Hogwarts House'] == 'Ravenclaw').astype(int)
Y = data[[col for col in data.columns if col.startswith('House ')]]
display(X)
display(Y)

Unnamed: 0,Best Hand,Astronomy,Herbology,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Charms,Flying
0,Left,-487.886086,5.727180,4.722,272.035831,532.484226,5.231058,1039.788281,3.790369,-232.79405,-26.89
1,Right,-552.060507,-5.987446,-5.612,-487.340557,367.760303,4.107170,1058.944592,7.248742,-252.18425,-113.45
2,Left,-366.076117,7.725017,6.140,664.893521,602.585284,3.555579,1088.088348,8.728531,-227.34265,30.42
3,Left,697.742809,-6.497214,4.026,-537.001128,523.982133,-4.809637,920.391449,0.821911,-256.84675,200.64
4,Left,436.775204,-7.820623,2.236,-444.262537,599.324514,-3.444377,937.434724,4.311066,-256.38730,157.98
...,...,...,...,...,...,...,...,...,...,...,...
1595,Right,354.280086,-4.541837,5.702,-497.235066,618.220213,-5.231721,964.219853,3.389086,-250.39401,185.83
1596,Left,367.531174,6.061064,1.757,-643.271092,445.827565,2.238112,1056.147366,5.825263,-246.42719,44.80
1597,Right,544.018925,-3.203269,6.065,-385.150457,635.211486,-5.984257,953.866685,1.709808,-251.63679,198.47
1598,Left,453.676219,3.442831,6.738,-831.741123,383.444937,3.813111,1087.949205,3.904100,-246.19072,-76.81


Unnamed: 0,House Slytherin,House Hufflepuff,House Gryffindor,House Ravenclaw
0,0,0,0,1
1,1,0,0,0
2,0,0,0,1
3,0,0,1,0
4,0,0,1,0
...,...,...,...,...
1595,0,0,1,0
1596,1,0,0,0
1597,0,0,1,0
1598,0,1,0,0


In [30]:
imputer = SimpleImputer(NUMERICAL_COLS, CATEGORICAL_COLS)
scaler = StandardScaler(NUMERICAL_COLS)
ohe = OneHotEncoder(CATEGORICAL_COLS)
preprocessor = PreprocessorPipeline([imputer, scaler, ohe])
preprocessor

--- SimpleImputer ---
Means: None
Modes: None

--- StandardScaler ---
Means: None
Standard Deviations: None

--- OneHotEncoder ---
Columns mapping: {}
Drop Last: True


In [31]:
preprocessor.fit(X)
display(preprocessor)
X_preprocessed = preprocessor.transform(X)
X_preprocessed

--- SimpleImputer ---
Means: {'Astronomy': 39.79713089016475, 'Herbology': 1.1410195296768046, 'Divination': 3.1539096732863547, 'Muggle Studies': -224.58991486346417, 'Ancient Runes': 495.74797005915786, 'Charms': -243.3744090125, 'Potions': 5.950372992780089, 'Transfiguration': 1030.0969463871306, 'History of Magic': 2.9630946151165936, 'Flying': 21.9580125}
Modes: {'Best Hand': 'Right'}

--- StandardScaler ---
Means: {'Astronomy': 39.79713089016475, 'Herbology': 1.1410195296768046, 'Divination': 3.1539096732863547, 'Muggle Studies': -224.58991486346417, 'Ancient Runes': 495.74797005915786, 'Charms': -243.3744090125, 'Potions': 5.950372992780089, 'Transfiguration': 1030.0969463871306, 'History of Magic': 2.9630946151165936, 'Flying': 21.9580125}
Standard Deviations: {'Astronomy': 520.2982676051708, 'Herbology': 5.2196819935318235, 'Divination': 4.155300897977581, 'Muggle Studies': 486.34483965206664, 'Ancient Runes': 106.28516457845274, 'Charms': 8.783639876017117, 'Potions': 3.14785

Unnamed: 0,Astronomy,Herbology,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Charms,Flying,Best Hand_Left
0,-1.014194,0.878628,0.377371,1.021139,0.345639,0.512444,0.219633,-0.686183,1.204553,-0.500330,1
1,-1.137535,-1.365690,-2.109573,-0.540256,-1.204191,0.258503,0.653769,0.412462,-1.002983,-1.386928,0
2,-0.780078,1.261379,0.718622,1.828915,1.005195,0.133871,1.314249,0.882556,1.825184,0.086673,1
3,1.264555,-1.463352,0.209874,-0.642366,0.265645,-1.756242,-2.486237,-1.629193,-1.533799,1.830165,1
4,0.762982,-1.716894,-0.220901,-0.451681,0.974516,-1.447763,-2.099988,-0.520770,-1.481492,1.393217,1
...,...,...,...,...,...,...,...,...,...,...,...
1595,0.604428,-1.088736,0.613214,-0.560600,1.152299,-1.851612,-1.492961,-0.813661,-0.799168,1.678473,0
1596,0.629896,0.942595,-0.336175,-0.860873,-0.469684,-0.163809,0.590376,-0.039745,-0.347553,0.233961,1
1597,0.969101,-0.832290,0.700573,-0.330137,1.312164,-2.021646,-1.727593,-1.347129,-0.940656,1.807939,0
1598,0.795465,0.440987,0.862534,-1.248397,-1.056620,0.192060,1.311096,-0.650053,-0.320631,-1.011640,1


In [32]:
sorting_hat = SortingHat(X_preprocessed.shape[1], lr=LEARNING_RATE)
for i in range(EPOCHS):
    sorting_hat.train_step(X_preprocessed, Y)

In [33]:
Y

Unnamed: 0,House Slytherin,House Hufflepuff,House Gryffindor,House Ravenclaw
0,0,0,0,1
1,1,0,0,0
2,0,0,0,1
3,0,0,1,0
4,0,0,1,0
...,...,...,...,...
1595,0,0,1,0
1596,1,0,0,0
1597,0,0,1,0
1598,0,1,0,0


In [34]:
sorting_hat.predict(X_preprocessed)

0       3
1       0
2       3
3       2
4       2
       ..
1595    2
1596    1
1597    2
1598    1
1599    1
Length: 1600, dtype: int64

In [35]:
data['pred'] = sorting_hat.predict(X_preprocessed)
houses = {0 : 'Slytherin', 1 : 'Hufflepuff', 2 : 'Gryffindor', 3 : 'Ravenclaw'}
data = data.replace({'pred':houses})
display(data.loc[X_preprocessed.index[0], :])
data['true']=(data['Hogwarts House'] == data['pred'])
data['true'].sum() / len(data)

Hogwarts House        Ravenclaw
Best Hand                  Left
Astronomy           -487.886086
Herbology               5.72718
Divination                4.722
Muggle Studies       272.035831
Ancient Runes        532.484226
History of Magic       5.231058
Transfiguration     1039.788281
Potions                3.790369
Charms               -232.79405
Flying                   -26.89
House Slytherin               0
House Hufflepuff              0
House Gryffindor              0
House Ravenclaw               1
pred                  Ravenclaw
Name: 0, dtype: object

0.981875

In [36]:
losses = pd.DataFrame(sorting_hat.losses)
fig = go.Figure()
fig.add_trace(go.Line(x = losses['step'], y = losses['train_loss']))
fig.show()


plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.




In [37]:
losses = pd.DataFrame(sorting_hat.losses)
fig = go.Figure()
fig.add_trace(go.Line(x = losses['step'][-100:], y = losses['train_loss'][-100:]))
fig.show()

# Predictions

In [38]:
test = pd.read_csv("datasets/dataset_test.csv")
test['Birthday'] = pd.to_datetime(test['Birthday'])
test['Birthday Weekday'] = test['Birthday'].dt.dayofweek
test['Birthday Year'] = test['Birthday'].dt.year
test['Birthday Month'] = test['Birthday'].dt.month
test = test.drop(columns=COLS_TO_DROP + ['Hogwarts House'])
test

Unnamed: 0,Best Hand,Astronomy,Herbology,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Charms,Flying
0,Right,696.096071,3.020172,7.996,-365.151850,393.138185,4.207691,1046.742736,3.668983,-244.48172,-13.62
1,Left,-370.844655,2.965226,6.349,522.580486,602.853051,6.460017,1048.053878,8.514622,-231.29200,-26.26
2,Left,320.303990,-6.185697,4.619,-630.073207,588.071795,-5.565818,936.437358,1.850829,-252.99343,200.15
3,Right,407.202928,4.962442,,-449.179806,427.699966,,1043.397718,4.656573,-244.01660,-11.15
4,Right,288.337747,3.737656,4.886,-449.732166,385.712782,2.876347,1051.377936,2.750586,-243.99806,-7.12
...,...,...,...,...,...,...,...,...,...,...,...
395,Left,-554.181932,-5.647655,-3.799,-591.764651,392.973420,7.048482,1047.648405,10.408749,-248.39978,-94.89
396,Left,632.233530,6.754862,3.294,-221.848397,319.360250,3.921402,1035.681313,-0.169741,-246.87982,-15.53
397,Right,292.108738,5.234530,4.230,-787.036050,433.259967,3.898160,1069.794110,6.495579,-244.01333,1.25
398,Left,-726.418553,6.735582,3.908,511.960762,613.391514,7.244499,1042.058804,7.554259,-228.24290,-18.27


In [39]:
test_preprocessed = preprocessor.transform(test)
test_preprocessed

Unnamed: 0,Astronomy,Herbology,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Charms,Flying,Best Hand_Left
0,1.261390,0.360013,1.165280,-0.289017,-0.965420,0.281215,0.377241,-0.724744,-0.126065,-0.364411,0
1,-0.789243,0.349486,0.768919,1.536298,1.007714,0.790127,0.406955,0.814602,1.375558,-0.493877,1
2,0.539127,-1.403671,0.352583,-0.833736,0.868643,-1.927101,-2.122591,-1.302330,-1.095106,1.825147,1
3,0.706145,0.732118,0.000000,-0.461791,-0.640240,0.000000,0.301433,-0.411010,-0.073112,-0.339112,0
4,0.477689,0.497470,0.416839,-0.462927,-1.035283,-0.019601,0.482287,-1.016498,-0.071001,-0.297834,0
...,...,...,...,...,...,...,...,...,...,...,...
395,-1.141613,-1.300592,-1.673263,-0.754968,-0.966970,0.923090,0.397766,1.416322,-0.572129,-1.196826,1
396,1.138648,1.075514,0.033714,0.005637,-1.659570,0.216529,0.126558,-1.944218,-0.399084,-0.383974,1
397,0.484936,0.784245,0.258968,-1.156476,-0.587928,0.211277,0.899650,0.173199,-0.072740,-0.212104,0
398,-1.472647,1.071821,0.181477,1.514462,1.106867,0.967380,0.271090,0.509517,1.722692,-0.412039,1


In [40]:
sorting_hat.predict(test_preprocessed)

0      1
1      3
2      2
3      1
4      1
      ..
395    0
396    1
397    1
398    3
399    3
Length: 400, dtype: int64

In [41]:
predictions = sorting_hat.predict(test_preprocessed)
houses = {0 : 'Slytherin', 1 : 'Hufflepuff', 2 : 'Gryffindor', 3 : 'Ravenclaw'}
predictions = predictions.replace(houses)
display(predictions)

0      Hufflepuff
1       Ravenclaw
2      Gryffindor
3      Hufflepuff
4      Hufflepuff
          ...    
395     Slytherin
396    Hufflepuff
397    Hufflepuff
398     Ravenclaw
399     Ravenclaw
Length: 400, dtype: object

In [42]:
truth = pd.read_csv("~/Downloads/dataset_truth.csv")
truth

Unnamed: 0,Index,Hogwarts House
0,0,Hufflepuff
1,1,Ravenclaw
2,2,Gryffindor
3,3,Hufflepuff
4,4,Hufflepuff
...,...,...
395,395,Slytherin
396,396,Hufflepuff
397,397,Hufflepuff
398,398,Ravenclaw


In [43]:
truth['pred'] = predictions

In [44]:
truth

Unnamed: 0,Index,Hogwarts House,pred
0,0,Hufflepuff,Hufflepuff
1,1,Ravenclaw,Ravenclaw
2,2,Gryffindor,Gryffindor
3,3,Hufflepuff,Hufflepuff
4,4,Hufflepuff,Hufflepuff
...,...,...,...
395,395,Slytherin,Slytherin
396,396,Hufflepuff,Hufflepuff
397,397,Hufflepuff,Hufflepuff
398,398,Ravenclaw,Ravenclaw


In [45]:
len(truth[truth['pred'] == truth['Hogwarts House']]) / len(truth) * 100

99.0