In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

COLS_TO_DROP = ['Index', 'First Name', 'Last Name', 'Birthday', 'Defense Against the Dark Arts', 'Arithmancy', 'Care of Magical Creatures', 'Birthday Month', 'Birthday Year', 'Birthday Weekday']
NUMERICAL_COLS = ['Astronomy', 'Herbology', 'Divination', 'Muggle Studies', 'Ancient Runes', 'Charms', 'Potions', 'Transfiguration', 'History of Magic', 'Flying']
CATEGORICAL_COLS = ['Best Hand']
LEARNING_RATE = 0.02
TARGET_CLASSES = ['Slytherin', 'Hufflepuff', 'Gryffindor', 'Ravenclaw']
EPOCHS = 1000

In [2]:
data = pd.read_csv("datasets/dataset_train.csv")
data['Birthday'] = pd.to_datetime(data['Birthday'])
data['Birthday Weekday'] = data['Birthday'].dt.dayofweek
data['Birthday Year'] = data['Birthday'].dt.year
data['Birthday Month'] = data['Birthday'].dt.month
data = data.drop(columns=COLS_TO_DROP)
data

Unnamed: 0,Hogwarts House,Best Hand,Astronomy,Herbology,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Charms,Flying
0,Ravenclaw,Left,-487.886086,5.727180,4.722,272.035831,532.484226,5.231058,1039.788281,3.790369,-232.79405,-26.89
1,Slytherin,Right,-552.060507,-5.987446,-5.612,-487.340557,367.760303,4.107170,1058.944592,7.248742,-252.18425,-113.45
2,Ravenclaw,Left,-366.076117,7.725017,6.140,664.893521,602.585284,3.555579,1088.088348,8.728531,-227.34265,30.42
3,Gryffindor,Left,697.742809,-6.497214,4.026,-537.001128,523.982133,-4.809637,920.391449,0.821911,-256.84675,200.64
4,Gryffindor,Left,436.775204,-7.820623,2.236,-444.262537,599.324514,-3.444377,937.434724,4.311066,-256.38730,157.98
...,...,...,...,...,...,...,...,...,...,...,...,...
1595,Gryffindor,Right,354.280086,-4.541837,5.702,-497.235066,618.220213,-5.231721,964.219853,3.389086,-250.39401,185.83
1596,Slytherin,Left,367.531174,6.061064,1.757,-643.271092,445.827565,2.238112,1056.147366,5.825263,-246.42719,44.80
1597,Gryffindor,Right,544.018925,-3.203269,6.065,-385.150457,635.211486,-5.984257,953.866685,1.709808,-251.63679,198.47
1598,Hufflepuff,Left,453.676219,3.442831,6.738,-831.741123,383.444937,3.813111,1087.949205,3.904100,-246.19072,-76.81


In [3]:
def ft_train_test_split(data, test_size=0.25, stratify_col=None, random_state=None):
    if not stratify_col:
        data_train = data.sample(frac=1-test_size, random_state=random_state)
    else:
        groups = data.groupby(stratify_col).groups
        data_train = pd.DataFrame()
        for group in groups.values():
            group_sample = data.iloc[group].sample(frac=1-test_size, random_state=random_state)
            data_train = pd.concat([data_train, group_sample])
    data_test = data.iloc[data.index.difference(data_train.index)]
    return (data_train, data_test)

In [4]:
data_train, data_test = ft_train_test_split(data, stratify_col='Hogwarts House')
display(data_train)
display(data_test)

Unnamed: 0,Hogwarts House,Best Hand,Astronomy,Herbology,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Charms,Flying
286,Gryffindor,Left,651.499961,-4.316909,5.471,-499.027269,607.281653,-5.288429,942.123720,3.149411,-253.00483,223.19
147,Gryffindor,Right,489.894253,-3.653055,6.509,-349.773327,627.385103,-4.990578,972.272775,4.661389,-248.89420,201.19
412,Gryffindor,Left,509.980158,-7.440085,3.520,-546.684843,567.109244,-3.090562,935.334722,3.644900,-255.70319,161.23
556,Gryffindor,Left,783.278014,-5.473951,2.576,,582.845083,-5.534045,922.558360,1.773658,-259.18162,212.83
546,Gryffindor,Right,460.982123,-4.458888,4.960,,616.957607,-5.824107,948.921200,1.596308,-253.03560,188.01
...,...,...,...,...,...,...,...,...,...,...,...,...
26,Slytherin,Left,-419.022085,-6.655230,-6.006,-322.019869,408.628000,4.837897,1039.943499,8.739039,-253.04749,-91.44
824,Slytherin,Left,540.609246,5.384337,7.361,-605.516736,433.113823,4.111261,1050.700323,5.112833,-242.49289,21.15
1149,Slytherin,Right,-371.412954,-4.932472,-4.002,-222.728470,467.730624,4.046902,1011.281282,,-250.88995,-45.31
855,Slytherin,Right,-426.994638,-5.475914,-6.911,-413.841604,404.038342,4.109301,,10.305071,-252.39653,-58.66


Unnamed: 0,Hogwarts House,Best Hand,Astronomy,Herbology,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Charms,Flying
5,Slytherin,Right,-613.687160,-4.289197,-6.592,-440.997704,396.201804,5.380286,1052.845164,11.751212,-247.94549,-34.69
10,Hufflepuff,Right,604.933962,5.484189,5.358,-530.795896,484.872671,5.699654,1036.285357,9.293132,-242.69168,64.61
14,Ravenclaw,Right,-197.527318,2.742444,6.603,527.356323,605.590600,5.480097,1063.522361,9.407484,-232.65964,-19.94
16,Gryffindor,Left,470.653757,-5.518264,4.425,-434.293266,596.610089,-4.161823,967.223912,5.027415,-252.27344,174.27
19,Hufflepuff,Right,458.127026,6.981589,5.686,-579.668591,403.327690,4.265810,1052.035235,3.456519,-242.99904,3.81
...,...,...,...,...,...,...,...,...,...,...,...,...
1588,Slytherin,Right,-492.510311,-2.208650,-7.270,-622.936567,,2.201186,1069.012391,9.117247,-250.91192,-19.50
1589,Hufflepuff,Left,708.202206,4.850931,5.660,-504.777873,417.520448,4.568628,1046.345436,6.272776,-245.03263,28.70
1591,Slytherin,Right,-507.715746,-4.997610,-5.637,-443.781855,386.058513,6.990603,1019.526453,7.696268,-251.06254,-94.84
1598,Hufflepuff,Left,453.676219,3.442831,6.738,-831.741123,383.444937,3.813111,1087.949205,3.904100,-246.19072,-76.81


In [5]:
X_train = data_train.drop(columns=['Hogwarts House'])
data_train['Slytherin'] = (data_train['Hogwarts House'] == 'Slytherin').astype(int)
data_train['Hufflepuff'] = (data_train['Hogwarts House'] == 'Hufflepuff').astype(int)
data_train['Gryffindor'] = (data_train['Hogwarts House'] == 'Gryffindor').astype(int)
data_train['Ravenclaw'] = (data_train['Hogwarts House'] == 'Ravenclaw').astype(int)
Y_train = data_train[TARGET_CLASSES]
display(X_train)
display(Y_train)

Unnamed: 0,Best Hand,Astronomy,Herbology,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Charms,Flying
286,Left,651.499961,-4.316909,5.471,-499.027269,607.281653,-5.288429,942.123720,3.149411,-253.00483,223.19
147,Right,489.894253,-3.653055,6.509,-349.773327,627.385103,-4.990578,972.272775,4.661389,-248.89420,201.19
412,Left,509.980158,-7.440085,3.520,-546.684843,567.109244,-3.090562,935.334722,3.644900,-255.70319,161.23
556,Left,783.278014,-5.473951,2.576,,582.845083,-5.534045,922.558360,1.773658,-259.18162,212.83
546,Right,460.982123,-4.458888,4.960,,616.957607,-5.824107,948.921200,1.596308,-253.03560,188.01
...,...,...,...,...,...,...,...,...,...,...,...
26,Left,-419.022085,-6.655230,-6.006,-322.019869,408.628000,4.837897,1039.943499,8.739039,-253.04749,-91.44
824,Left,540.609246,5.384337,7.361,-605.516736,433.113823,4.111261,1050.700323,5.112833,-242.49289,21.15
1149,Right,-371.412954,-4.932472,-4.002,-222.728470,467.730624,4.046902,1011.281282,,-250.88995,-45.31
855,Right,-426.994638,-5.475914,-6.911,-413.841604,404.038342,4.109301,,10.305071,-252.39653,-58.66


Unnamed: 0,Slytherin,Hufflepuff,Gryffindor,Ravenclaw
286,0,0,1,0
147,0,0,1,0
412,0,0,1,0
556,0,0,1,0
546,0,0,1,0
...,...,...,...,...
26,1,0,0,0
824,1,0,0,0
1149,1,0,0,0
855,1,0,0,0


In [6]:
X_test = data_test.drop(columns=['Hogwarts House'])
data_test['Slytherin'] = (data_test['Hogwarts House'] == 'Slytherin').astype(int)
data_test['Hufflepuff'] = (data_test['Hogwarts House'] == 'Hufflepuff').astype(int)
data_test['Gryffindor'] = (data_test['Hogwarts House'] == 'Gryffindor').astype(int)
data_test['Ravenclaw'] = (data_test['Hogwarts House'] == 'Ravenclaw').astype(int)
Y_test = data_test[TARGET_CLASSES]
display(X_test)
display(Y_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_test['Slytherin'] = (data_test['Hogwarts House'] == 'Slytherin').astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_test['Hufflepuff'] = (data_test['Hogwarts House'] == 'Hufflepuff').astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_test['Gryffindor'] = (data_test['H

Unnamed: 0,Best Hand,Astronomy,Herbology,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Charms,Flying
5,Right,-613.687160,-4.289197,-6.592,-440.997704,396.201804,5.380286,1052.845164,11.751212,-247.94549,-34.69
10,Right,604.933962,5.484189,5.358,-530.795896,484.872671,5.699654,1036.285357,9.293132,-242.69168,64.61
14,Right,-197.527318,2.742444,6.603,527.356323,605.590600,5.480097,1063.522361,9.407484,-232.65964,-19.94
16,Left,470.653757,-5.518264,4.425,-434.293266,596.610089,-4.161823,967.223912,5.027415,-252.27344,174.27
19,Right,458.127026,6.981589,5.686,-579.668591,403.327690,4.265810,1052.035235,3.456519,-242.99904,3.81
...,...,...,...,...,...,...,...,...,...,...,...
1588,Right,-492.510311,-2.208650,-7.270,-622.936567,,2.201186,1069.012391,9.117247,-250.91192,-19.50
1589,Left,708.202206,4.850931,5.660,-504.777873,417.520448,4.568628,1046.345436,6.272776,-245.03263,28.70
1591,Right,-507.715746,-4.997610,-5.637,-443.781855,386.058513,6.990603,1019.526453,7.696268,-251.06254,-94.84
1598,Left,453.676219,3.442831,6.738,-831.741123,383.444937,3.813111,1087.949205,3.904100,-246.19072,-76.81


Unnamed: 0,Slytherin,Hufflepuff,Gryffindor,Ravenclaw
5,1,0,0,0
10,0,1,0,0
14,0,0,0,1
16,0,0,1,0
19,0,1,0,0
...,...,...,...,...
1588,1,0,0,0
1589,0,1,0,0
1591,1,0,0,0
1598,0,1,0,0


# Missing values

In [7]:
X = pd.concat([X_train, X_test])
Y = pd.concat([Y_train, Y_test])

In [8]:
X.isna().sum() / len(X) * 100

Best Hand           0.0000
Astronomy           2.0000
Herbology           2.0625
Divination          2.4375
Muggle Studies      2.1875
Ancient Runes       2.1875
History of Magic    2.6875
Transfiguration     2.1250
Potions             1.8750
Charms              0.0000
Flying              0.0000
dtype: float64

In [9]:
Y.isna().sum() / len(Y) * 100

Slytherin     0.0
Hufflepuff    0.0
Gryffindor    0.0
Ravenclaw     0.0
dtype: float64

In [10]:
(Y['Slytherin'] + Y['Gryffindor'] + Y['Ravenclaw'] + Y['Hufflepuff']).value_counts()

1    1600
Name: count, dtype: int64

# Preprocessings

In [11]:
from logreg_train import SimpleImputer, StandardScaler, OneHotEncoder, PreprocessorPipeline

imputer = SimpleImputer(NUMERICAL_COLS, CATEGORICAL_COLS)
scaler = StandardScaler(NUMERICAL_COLS)
ohe = OneHotEncoder(CATEGORICAL_COLS)
preprocessor = PreprocessorPipeline([imputer, scaler, ohe])
preprocessor

--- SimpleImputer ---
Means: None
Modes: None

--- StandardScaler ---
Means: None
Standard Deviations: None

--- OneHotEncoder ---
Columns mapping: {}
Drop Last: True


In [12]:
preprocessor.fit(X_train)
preprocessor

--- SimpleImputer ---
Means: {'Astronomy': 42.74944646734241, 'Herbology': 1.1681375073057356, 'Divination': 3.1483341858482525, 'Muggle Studies': -222.63875806526488, 'Ancient Runes': 495.59744013489546, 'Charms': -243.37200060833334, 'Potions': 5.936375733289444, 'Transfiguration': 1029.9973801339652, 'History of Magic': 2.9694525101169043, 'Flying': 22.22878333333333}
Modes: {'Best Hand': 'Right'}

--- StandardScaler ---
Means: {'Astronomy': 42.74944646734241, 'Herbology': 1.1681375073057356, 'Divination': 3.1483341858482525, 'Muggle Studies': -222.63875806526488, 'Ancient Runes': 495.59744013489546, 'Charms': -243.37200060833334, 'Potions': 5.936375733289444, 'Transfiguration': 1029.9973801339652, 'History of Magic': 2.9694525101169043, 'Flying': 22.22878333333333}
Standard Deviations: {'Astronomy': 522.0445230701418, 'Herbology': 5.1716238531017, 'Divination': 4.131015988193361, 'Muggle Studies': 485.3154883297681, 'Ancient Runes': 106.45761974759353, 'Charms': 8.728202354112108, 

In [13]:
X_train_preprocessed = preprocessor.transform(X_train)
X_train_preprocessed

Unnamed: 0,Astronomy,Herbology,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Charms,Flying,Best Hand_Left
286,1.166089,-1.060604,0.562251,-0.569503,1.049096,-1.868164,-2.000838,-0.887136,-1.103644,2.067327,1
147,0.856526,-0.932240,0.813520,-0.261963,1.237935,-1.800781,-1.314360,-0.405849,-0.632685,1.841008,0
412,0.895002,-1.664511,0.089970,-0.667702,0.671740,-1.370945,-2.155420,-0.729414,-1.412798,1.429932,1
556,1.418516,-1.284333,-0.138546,0.000000,0.819553,-1.923729,-2.446332,-1.325060,-1.811326,1.960751,1
546,0.801144,-1.088058,0.438552,0.000000,1.139986,-1.989349,-1.846063,-1.381514,-1.107169,1.705423,0
...,...,...,...,...,...,...,...,...,...,...,...
26,-0.884544,-1.512749,-2.216001,-0.204776,-0.816940,0.422694,0.226468,0.892133,-1.108532,-1.169333,1
824,0.953673,0.815256,1.019765,-0.788926,-0.586934,0.258309,0.471395,-0.262147,0.100721,-0.011098,1
1149,-0.793347,-1.179631,-1.730890,-0.000185,-0.261764,0.243749,-0.426156,0.000000,-0.861340,-0.694784,0
855,-0.899816,-1.284713,-2.435075,-0.393976,-0.860052,0.257866,0.000000,1.390626,-1.033951,-0.832118,0


In [14]:
from logreg_train import SortingHat

sorting_hat = SortingHat(X_train_preprocessed.shape[1], lr=LEARNING_RATE)
sorting_hat

<logreg_train.SortingHat at 0x7f8b6eb6e230>

In [15]:
sorting_hat.parameters

{'Slytherin': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'Hufflepuff': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'Gryffindor': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'Ravenclaw': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}

In [16]:
X_test_preprocessed = preprocessor.transform(X_test)
X_test_preprocessed

Unnamed: 0,Astronomy,Herbology,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Charms,Flying,Best Hand_Left
5,-1.257434,-1.055246,-2.357854,-0.449932,-0.933664,0.545398,0.520232,1.850956,-0.523990,-0.585534,0
10,1.076890,0.834564,0.534896,-0.634963,-0.100742,0.617648,0.143174,1.068510,0.077945,0.435984,0
14,-0.460261,0.304412,0.836275,1.545376,1.033211,0.567978,0.763347,1.104910,1.227327,-0.433798,0
16,0.819670,-1.292902,0.309044,-0.436117,0.948853,-1.613294,-1.429320,-0.289337,-1.019848,1.564077,1
19,0.795675,1.124106,0.614296,-0.735665,-0.866728,0.293272,0.501791,-0.789378,0.042731,-0.189478,0
...,...,...,...,...,...,...,...,...,...,...,...
1588,-1.025314,-0.652945,-2.521979,-0.824820,0.000000,-0.173803,0.888352,1.012523,-0.863857,-0.429272,0
1589,1.274705,0.712115,0.608002,-0.581352,-0.733409,0.361778,0.372237,0.107082,-0.190260,0.066571,1
1591,-1.054441,-1.192227,-2.126676,-0.455669,-1.028944,0.909697,-0.238418,0.560202,-0.881114,-1.204309,0
1598,0.787149,0.439841,0.868955,-1.255065,-1.053494,0.190859,1.319533,-0.646906,-0.322944,-1.018831,1


In [17]:
Y_train

Unnamed: 0,Slytherin,Hufflepuff,Gryffindor,Ravenclaw
286,0,0,1,0
147,0,0,1,0
412,0,0,1,0
556,0,0,1,0
546,0,0,1,0
...,...,...,...,...
26,1,0,0,0
824,1,0,0,0
1149,1,0,0,0
855,1,0,0,0


In [18]:
for i in range(EPOCHS):
    sorting_hat.train_step(X_train_preprocessed, Y_train, X_test_preprocessed, Y_test)

In [19]:
data_train['pred'] = sorting_hat.predict(X_train_preprocessed)
houses = {0 : 'Slytherin', 1 : 'Hufflepuff', 2 : 'Gryffindor', 3 : 'Ravenclaw'}
data_train = data_train.replace({'pred':houses})
display(data_train)
data_train['true']=(data_train['Hogwarts House'] == data_train['pred'])
data_train['true'].sum() / len(data_train)

Unnamed: 0,Hogwarts House,Best Hand,Astronomy,Herbology,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Charms,Flying,Slytherin,Hufflepuff,Gryffindor,Ravenclaw,pred
286,Gryffindor,Left,651.499961,-4.316909,5.471,-499.027269,607.281653,-5.288429,942.123720,3.149411,-253.00483,223.19,0,0,1,0,Gryffindor
147,Gryffindor,Right,489.894253,-3.653055,6.509,-349.773327,627.385103,-4.990578,972.272775,4.661389,-248.89420,201.19,0,0,1,0,Gryffindor
412,Gryffindor,Left,509.980158,-7.440085,3.520,-546.684843,567.109244,-3.090562,935.334722,3.644900,-255.70319,161.23,0,0,1,0,Gryffindor
556,Gryffindor,Left,783.278014,-5.473951,2.576,,582.845083,-5.534045,922.558360,1.773658,-259.18162,212.83,0,0,1,0,Gryffindor
546,Gryffindor,Right,460.982123,-4.458888,4.960,,616.957607,-5.824107,948.921200,1.596308,-253.03560,188.01,0,0,1,0,Gryffindor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26,Slytherin,Left,-419.022085,-6.655230,-6.006,-322.019869,408.628000,4.837897,1039.943499,8.739039,-253.04749,-91.44,1,0,0,0,Slytherin
824,Slytherin,Left,540.609246,5.384337,7.361,-605.516736,433.113823,4.111261,1050.700323,5.112833,-242.49289,21.15,1,0,0,0,Hufflepuff
1149,Slytherin,Right,-371.412954,-4.932472,-4.002,-222.728470,467.730624,4.046902,1011.281282,,-250.88995,-45.31,1,0,0,0,Slytherin
855,Slytherin,Right,-426.994638,-5.475914,-6.911,-413.841604,404.038342,4.109301,,10.305071,-252.39653,-58.66,1,0,0,0,Slytherin


0.9783333333333334

In [20]:
data_test = data.iloc[X_test.index]

In [21]:
data_test['pred'] = sorting_hat.predict(X_test_preprocessed)
houses = {0 : 'Slytherin', 1 : 'Hufflepuff', 2 : 'Gryffindor', 3 : 'Ravenclaw'}
data_test = data_test.replace({'pred':houses})
data_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_test['pred'] = sorting_hat.predict(X_test_preprocessed)


Unnamed: 0,Hogwarts House,Best Hand,Astronomy,Herbology,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Charms,Flying,pred
5,Slytherin,Right,-613.687160,-4.289197,-6.592,-440.997704,396.201804,5.380286,1052.845164,11.751212,-247.94549,-34.69,Slytherin
10,Hufflepuff,Right,604.933962,5.484189,5.358,-530.795896,484.872671,5.699654,1036.285357,9.293132,-242.69168,64.61,Hufflepuff
14,Ravenclaw,Right,-197.527318,2.742444,6.603,527.356323,605.590600,5.480097,1063.522361,9.407484,-232.65964,-19.94,Ravenclaw
16,Gryffindor,Left,470.653757,-5.518264,4.425,-434.293266,596.610089,-4.161823,967.223912,5.027415,-252.27344,174.27,Gryffindor
19,Hufflepuff,Right,458.127026,6.981589,5.686,-579.668591,403.327690,4.265810,1052.035235,3.456519,-242.99904,3.81,Hufflepuff
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1588,Slytherin,Right,-492.510311,-2.208650,-7.270,-622.936567,,2.201186,1069.012391,9.117247,-250.91192,-19.50,Slytherin
1589,Hufflepuff,Left,708.202206,4.850931,5.660,-504.777873,417.520448,4.568628,1046.345436,6.272776,-245.03263,28.70,Hufflepuff
1591,Slytherin,Right,-507.715746,-4.997610,-5.637,-443.781855,386.058513,6.990603,1019.526453,7.696268,-251.06254,-94.84,Slytherin
1598,Hufflepuff,Left,453.676219,3.442831,6.738,-831.741123,383.444937,3.813111,1087.949205,3.904100,-246.19072,-76.81,Hufflepuff


In [22]:
losses = pd.DataFrame(sorting_hat.losses)
fig = go.Figure()
fig.add_trace(go.Line(x = losses['step'], y = losses['train_loss']))
fig.add_trace(go.Line(x = losses['step'], y = losses['test_loss']))
fig.show()


plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.




In [23]:
losses = pd.DataFrame(sorting_hat.losses)
fig = go.Figure()
fig.add_trace(go.Line(x = losses['step'][-100:], y = losses['train_loss'][-100:]))
fig.add_trace(go.Line(x = losses['step'][-100:], y = losses['test_loss'][-100:]))
fig.show()

In [24]:
data_train['Hogwarts House'].value_counts() / len(data_train)

Hogwarts House
Hufflepuff    0.330833
Ravenclaw     0.276667
Gryffindor    0.204167
Slytherin     0.188333
Name: count, dtype: float64

In [25]:
data_test['Hogwarts House'].value_counts() / len(data_test)

Hogwarts House
Hufflepuff    0.3300
Ravenclaw     0.2775
Gryffindor    0.2050
Slytherin     0.1875
Name: count, dtype: float64

In [26]:
data['Hogwarts House'].value_counts() / len(data)

Hogwarts House
Hufflepuff    0.330625
Ravenclaw     0.276875
Gryffindor    0.204375
Slytherin     0.188125
Name: count, dtype: float64

In [27]:
feature_importance_s = {X_train_preprocessed.columns[i]: sorting_hat.logregs['Slytherin'].weights[i] for i in range(X_train_preprocessed.shape[1])}
feature_importance_h = {X_train_preprocessed.columns[i]: sorting_hat.logregs['Hufflepuff'].weights[i] for i in range(X_train_preprocessed.shape[1])}
feature_importance_g = {X_train_preprocessed.columns[i]: sorting_hat.logregs['Gryffindor'].weights[i] for i in range(X_train_preprocessed.shape[1])}
feature_importance_r = {X_train_preprocessed.columns[i]: sorting_hat.logregs['Ravenclaw'].weights[i] for i in range(X_train_preprocessed.shape[1])}
feature_importance = pd.DataFrame([feature_importance_s, feature_importance_h, feature_importance_g, feature_importance_r],
                                  index=['Slytherin', 'Hufflepuff', 'Gryffindor', 'Ravenclaw']).T

feature_importance


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`



Unnamed: 0,Slytherin,Hufflepuff,Gryffindor,Ravenclaw
Astronomy,-0.75239,1.410356,0.322726,-0.826135
Herbology,-0.748637,1.079419,-0.593724,0.480001
Divination,-1.245396,0.741093,0.276403,0.294891
Muggle Studies,-0.274027,-0.838913,-0.117724,1.088207
Ancient Runes,-0.445216,-1.195553,0.554635,0.868629
History of Magic,0.128746,0.617011,-0.728744,0.137347
Transfiguration,0.226895,0.544951,-0.781975,0.157149
Potions,0.612869,-0.410559,-0.309696,0.080671
Charms,-0.45028,-0.19666,-0.393449,1.04585
Flying,-0.498643,-0.322202,0.788376,-0.049758


In [28]:
data_test['true']=(data_test['Hogwarts House'] == data_test['pred'])
print(data_train['true'].sum() / len(data_train))
print(data_test['true'].sum() / len(data_test))

0.9783333333333334
0.9925


In [29]:
px.bar(feature_importance, labels={
                     "index": "Feature",
                     "value": "Importance"
                 })

# Training model on whole dataset

In [30]:
data = pd.read_csv("datasets/dataset_train.csv")
data['Birthday'] = pd.to_datetime(data['Birthday'])
data['Birthday Weekday'] = data['Birthday'].dt.dayofweek
data['Birthday Year'] = data['Birthday'].dt.year
data['Birthday Month'] = data['Birthday'].dt.month
data = data.drop(columns=COLS_TO_DROP)
data

Unnamed: 0,Hogwarts House,Best Hand,Astronomy,Herbology,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Charms,Flying
0,Ravenclaw,Left,-487.886086,5.727180,4.722,272.035831,532.484226,5.231058,1039.788281,3.790369,-232.79405,-26.89
1,Slytherin,Right,-552.060507,-5.987446,-5.612,-487.340557,367.760303,4.107170,1058.944592,7.248742,-252.18425,-113.45
2,Ravenclaw,Left,-366.076117,7.725017,6.140,664.893521,602.585284,3.555579,1088.088348,8.728531,-227.34265,30.42
3,Gryffindor,Left,697.742809,-6.497214,4.026,-537.001128,523.982133,-4.809637,920.391449,0.821911,-256.84675,200.64
4,Gryffindor,Left,436.775204,-7.820623,2.236,-444.262537,599.324514,-3.444377,937.434724,4.311066,-256.38730,157.98
...,...,...,...,...,...,...,...,...,...,...,...,...
1595,Gryffindor,Right,354.280086,-4.541837,5.702,-497.235066,618.220213,-5.231721,964.219853,3.389086,-250.39401,185.83
1596,Slytherin,Left,367.531174,6.061064,1.757,-643.271092,445.827565,2.238112,1056.147366,5.825263,-246.42719,44.80
1597,Gryffindor,Right,544.018925,-3.203269,6.065,-385.150457,635.211486,-5.984257,953.866685,1.709808,-251.63679,198.47
1598,Hufflepuff,Left,453.676219,3.442831,6.738,-831.741123,383.444937,3.813111,1087.949205,3.904100,-246.19072,-76.81


In [31]:
X = data.drop(columns=['Hogwarts House'])
data['Slytherin'] = (data['Hogwarts House'] == 'Slytherin').astype(int)
data['Hufflepuff'] = (data['Hogwarts House'] == 'Hufflepuff').astype(int)
data['Gryffindor'] = (data['Hogwarts House'] == 'Gryffindor').astype(int)
data['Ravenclaw'] = (data['Hogwarts House'] == 'Ravenclaw').astype(int)
Y = data[TARGET_CLASSES]
display(X)
display(Y)

Unnamed: 0,Best Hand,Astronomy,Herbology,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Charms,Flying
0,Left,-487.886086,5.727180,4.722,272.035831,532.484226,5.231058,1039.788281,3.790369,-232.79405,-26.89
1,Right,-552.060507,-5.987446,-5.612,-487.340557,367.760303,4.107170,1058.944592,7.248742,-252.18425,-113.45
2,Left,-366.076117,7.725017,6.140,664.893521,602.585284,3.555579,1088.088348,8.728531,-227.34265,30.42
3,Left,697.742809,-6.497214,4.026,-537.001128,523.982133,-4.809637,920.391449,0.821911,-256.84675,200.64
4,Left,436.775204,-7.820623,2.236,-444.262537,599.324514,-3.444377,937.434724,4.311066,-256.38730,157.98
...,...,...,...,...,...,...,...,...,...,...,...
1595,Right,354.280086,-4.541837,5.702,-497.235066,618.220213,-5.231721,964.219853,3.389086,-250.39401,185.83
1596,Left,367.531174,6.061064,1.757,-643.271092,445.827565,2.238112,1056.147366,5.825263,-246.42719,44.80
1597,Right,544.018925,-3.203269,6.065,-385.150457,635.211486,-5.984257,953.866685,1.709808,-251.63679,198.47
1598,Left,453.676219,3.442831,6.738,-831.741123,383.444937,3.813111,1087.949205,3.904100,-246.19072,-76.81


Unnamed: 0,Slytherin,Hufflepuff,Gryffindor,Ravenclaw
0,0,0,0,1
1,1,0,0,0
2,0,0,0,1
3,0,0,1,0
4,0,0,1,0
...,...,...,...,...
1595,0,0,1,0
1596,1,0,0,0
1597,0,0,1,0
1598,0,1,0,0


In [32]:
imputer = SimpleImputer(NUMERICAL_COLS, CATEGORICAL_COLS)
scaler = StandardScaler(NUMERICAL_COLS)
ohe = OneHotEncoder(CATEGORICAL_COLS)
preprocessor = PreprocessorPipeline([imputer, scaler, ohe])
preprocessor

--- SimpleImputer ---
Means: None
Modes: None

--- StandardScaler ---
Means: None
Standard Deviations: None

--- OneHotEncoder ---
Columns mapping: {}
Drop Last: True


In [33]:
preprocessor.fit(X)
display(preprocessor)
X_preprocessed = preprocessor.transform(X)
X_preprocessed

--- SimpleImputer ---
Means: {'Astronomy': 39.79713089016475, 'Herbology': 1.1410195296768046, 'Divination': 3.1539096732863547, 'Muggle Studies': -224.58991486346417, 'Ancient Runes': 495.74797005915786, 'Charms': -243.3744090125, 'Potions': 5.950372992780089, 'Transfiguration': 1030.0969463871306, 'History of Magic': 2.9630946151165936, 'Flying': 21.9580125}
Modes: {'Best Hand': 'Right'}

--- StandardScaler ---
Means: {'Astronomy': 39.79713089016475, 'Herbology': 1.1410195296768046, 'Divination': 3.1539096732863547, 'Muggle Studies': -224.58991486346417, 'Ancient Runes': 495.74797005915786, 'Charms': -243.3744090125, 'Potions': 5.950372992780089, 'Transfiguration': 1030.0969463871306, 'History of Magic': 2.9630946151165936, 'Flying': 21.9580125}
Standard Deviations: {'Astronomy': 520.2982676051708, 'Herbology': 5.2196819935318235, 'Divination': 4.155300897977581, 'Muggle Studies': 486.34483965206664, 'Ancient Runes': 106.28516457845274, 'Charms': 8.783639876017117, 'Potions': 3.14785

Unnamed: 0,Astronomy,Herbology,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Charms,Flying,Best Hand_Left
0,-1.014194,0.878628,0.377371,1.021139,0.345639,0.512444,0.219633,-0.686183,1.204553,-0.500330,1
1,-1.137535,-1.365690,-2.109573,-0.540256,-1.204191,0.258503,0.653769,0.412462,-1.002983,-1.386928,0
2,-0.780078,1.261379,0.718622,1.828915,1.005195,0.133871,1.314249,0.882556,1.825184,0.086673,1
3,1.264555,-1.463352,0.209874,-0.642366,0.265645,-1.756242,-2.486237,-1.629193,-1.533799,1.830165,1
4,0.762982,-1.716894,-0.220901,-0.451681,0.974516,-1.447763,-2.099988,-0.520770,-1.481492,1.393217,1
...,...,...,...,...,...,...,...,...,...,...,...
1595,0.604428,-1.088736,0.613214,-0.560600,1.152299,-1.851612,-1.492961,-0.813661,-0.799168,1.678473,0
1596,0.629896,0.942595,-0.336175,-0.860873,-0.469684,-0.163809,0.590376,-0.039745,-0.347553,0.233961,1
1597,0.969101,-0.832290,0.700573,-0.330137,1.312164,-2.021646,-1.727593,-1.347129,-0.940656,1.807939,0
1598,0.795465,0.440987,0.862534,-1.248397,-1.056620,0.192060,1.311096,-0.650053,-0.320631,-1.011640,1


In [34]:
sorting_hat = SortingHat(X_preprocessed.shape[1], lr=LEARNING_RATE)
for i in range(EPOCHS):
    sorting_hat.train_step(X_preprocessed, Y)

In [35]:
Y

Unnamed: 0,Slytherin,Hufflepuff,Gryffindor,Ravenclaw
0,0,0,0,1
1,1,0,0,0
2,0,0,0,1
3,0,0,1,0
4,0,0,1,0
...,...,...,...,...
1595,0,0,1,0
1596,1,0,0,0
1597,0,0,1,0
1598,0,1,0,0


In [36]:
sorting_hat.predict(X_preprocessed)

0        Ravenclaw
1        Slytherin
2        Ravenclaw
3       Gryffindor
4       Gryffindor
           ...    
1595    Gryffindor
1596    Hufflepuff
1597    Gryffindor
1598    Hufflepuff
1599    Hufflepuff
Length: 1600, dtype: object

In [37]:
data['pred'] = sorting_hat.predict(X_preprocessed)
houses = {0 : 'Slytherin', 1 : 'Hufflepuff', 2 : 'Gryffindor', 3 : 'Ravenclaw'}
data = data.replace({'pred':houses})
display(data.loc[X_preprocessed.index[0], :])
data['true']=(data['Hogwarts House'] == data['pred'])
data['true'].sum() / len(data)

Hogwarts House        Ravenclaw
Best Hand                  Left
Astronomy           -487.886086
Herbology               5.72718
Divination                4.722
Muggle Studies       272.035831
Ancient Runes        532.484226
History of Magic       5.231058
Transfiguration     1039.788281
Potions                3.790369
Charms               -232.79405
Flying                   -26.89
Slytherin                     0
Hufflepuff                    0
Gryffindor                    0
Ravenclaw                     1
pred                  Ravenclaw
Name: 0, dtype: object

0.981875

In [38]:
losses = pd.DataFrame(sorting_hat.losses)
fig = go.Figure()
fig.add_trace(go.Line(x = losses['step'], y = losses['train_loss']))
fig.show()


plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.




In [39]:
losses = pd.DataFrame(sorting_hat.losses)
fig = go.Figure()
fig.add_trace(go.Line(x = losses['step'][-100:], y = losses['train_loss'][-100:]))
fig.show()

# Predictions

In [40]:
test = pd.read_csv("datasets/dataset_test.csv")
test['Birthday'] = pd.to_datetime(test['Birthday'])
test['Birthday Weekday'] = test['Birthday'].dt.dayofweek
test['Birthday Year'] = test['Birthday'].dt.year
test['Birthday Month'] = test['Birthday'].dt.month
test = test.drop(columns=COLS_TO_DROP + ['Hogwarts House'])
test

Unnamed: 0,Best Hand,Astronomy,Herbology,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Charms,Flying
0,Right,696.096071,3.020172,7.996,-365.151850,393.138185,4.207691,1046.742736,3.668983,-244.48172,-13.62
1,Left,-370.844655,2.965226,6.349,522.580486,602.853051,6.460017,1048.053878,8.514622,-231.29200,-26.26
2,Left,320.303990,-6.185697,4.619,-630.073207,588.071795,-5.565818,936.437358,1.850829,-252.99343,200.15
3,Right,407.202928,4.962442,,-449.179806,427.699966,,1043.397718,4.656573,-244.01660,-11.15
4,Right,288.337747,3.737656,4.886,-449.732166,385.712782,2.876347,1051.377936,2.750586,-243.99806,-7.12
...,...,...,...,...,...,...,...,...,...,...,...
395,Left,-554.181932,-5.647655,-3.799,-591.764651,392.973420,7.048482,1047.648405,10.408749,-248.39978,-94.89
396,Left,632.233530,6.754862,3.294,-221.848397,319.360250,3.921402,1035.681313,-0.169741,-246.87982,-15.53
397,Right,292.108738,5.234530,4.230,-787.036050,433.259967,3.898160,1069.794110,6.495579,-244.01333,1.25
398,Left,-726.418553,6.735582,3.908,511.960762,613.391514,7.244499,1042.058804,7.554259,-228.24290,-18.27


In [41]:
test_preprocessed = preprocessor.transform(test)
test_preprocessed

Unnamed: 0,Astronomy,Herbology,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Charms,Flying,Best Hand_Left
0,1.261390,0.360013,1.165280,-0.289017,-0.965420,0.281215,0.377241,-0.724744,-0.126065,-0.364411,0
1,-0.789243,0.349486,0.768919,1.536298,1.007714,0.790127,0.406955,0.814602,1.375558,-0.493877,1
2,0.539127,-1.403671,0.352583,-0.833736,0.868643,-1.927101,-2.122591,-1.302330,-1.095106,1.825147,1
3,0.706145,0.732118,0.000000,-0.461791,-0.640240,0.000000,0.301433,-0.411010,-0.073112,-0.339112,0
4,0.477689,0.497470,0.416839,-0.462927,-1.035283,-0.019601,0.482287,-1.016498,-0.071001,-0.297834,0
...,...,...,...,...,...,...,...,...,...,...,...
395,-1.141613,-1.300592,-1.673263,-0.754968,-0.966970,0.923090,0.397766,1.416322,-0.572129,-1.196826,1
396,1.138648,1.075514,0.033714,0.005637,-1.659570,0.216529,0.126558,-1.944218,-0.399084,-0.383974,1
397,0.484936,0.784245,0.258968,-1.156476,-0.587928,0.211277,0.899650,0.173199,-0.072740,-0.212104,0
398,-1.472647,1.071821,0.181477,1.514462,1.106867,0.967380,0.271090,0.509517,1.722692,-0.412039,1


In [42]:
sorting_hat.predict(test_preprocessed)

0      Hufflepuff
1       Ravenclaw
2      Gryffindor
3      Hufflepuff
4      Hufflepuff
          ...    
395     Slytherin
396    Hufflepuff
397    Hufflepuff
398     Ravenclaw
399     Ravenclaw
Length: 400, dtype: object

In [43]:
predictions = sorting_hat.predict(test_preprocessed)
houses = {0 : 'Slytherin', 1 : 'Hufflepuff', 2 : 'Gryffindor', 3 : 'Ravenclaw'}
predictions = predictions.replace(houses)
display(predictions)

0      Hufflepuff
1       Ravenclaw
2      Gryffindor
3      Hufflepuff
4      Hufflepuff
          ...    
395     Slytherin
396    Hufflepuff
397    Hufflepuff
398     Ravenclaw
399     Ravenclaw
Length: 400, dtype: object

In [44]:
truth = pd.read_csv("~/Downloads/dataset_truth.csv")
truth

Unnamed: 0,Index,Hogwarts House
0,0,Hufflepuff
1,1,Ravenclaw
2,2,Gryffindor
3,3,Hufflepuff
4,4,Hufflepuff
...,...,...
395,395,Slytherin
396,396,Hufflepuff
397,397,Hufflepuff
398,398,Ravenclaw


In [45]:
truth['pred'] = predictions

In [46]:
truth

Unnamed: 0,Index,Hogwarts House,pred
0,0,Hufflepuff,Hufflepuff
1,1,Ravenclaw,Ravenclaw
2,2,Gryffindor,Gryffindor
3,3,Hufflepuff,Hufflepuff
4,4,Hufflepuff,Hufflepuff
...,...,...,...
395,395,Slytherin,Slytherin
396,396,Hufflepuff,Hufflepuff
397,397,Hufflepuff,Hufflepuff
398,398,Ravenclaw,Ravenclaw


In [47]:
len(truth[truth['pred'] == truth['Hogwarts House']]) / len(truth) * 100

99.0

In [48]:
sorting_hat.parameters

{'Slytherin': array([-0.77339645, -0.76649033, -1.26075815, -0.29054434, -0.46186801,
         0.15432559,  0.23230334,  0.62805899, -0.46196045, -0.52079843,
         0.00496561, -3.05793273]),
 'Hufflepuff': array([ 1.45628288,  1.13594952,  0.75581449, -0.86375488, -1.2640288 ,
         0.63069566,  0.58179267, -0.46335452, -0.19801112, -0.33259835,
        -0.0073767 , -1.68661418]),
 'Gryffindor': array([ 0.32063395, -0.60749732,  0.2611091 , -0.13562068,  0.57398496,
        -0.76611252, -0.81569793, -0.30917329, -0.4089485 ,  0.8140605 ,
        -0.01742464, -3.02428838]),
 'Ravenclaw': array([-0.84175664,  0.47367433,  0.29832954,  1.1223061 ,  0.88251389,
         0.1630664 ,  0.17893637,  0.11926481,  1.06425102, -0.07386469,
         0.00311705, -2.30400271])}

In [49]:
import json

model = {X_preprocessed.columns[i]: sorting_hat.weights[i] for i in range(X_preprocessed.shape[1])}
model['bias'] = sorting_hat.bias

model

AttributeError: 'SortingHat' object has no attribute 'weights'