In [None]:
import numpy as np
import pandas as pd

In [5]:
# Load the CSV into the DataFrame that every later cell reads from.
DATA_PATH = 'sample_data/Expanded_data_with_more_features.csv'
df = pd.read_csv(DATA_PATH)

In [40]:
# Preview the first five rows to check the columns and spot missing values.
df.head(5)

Unnamed: 0.1,Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore,ReadingScore,WritingScore
0,0,female,,bachelor's degree,standard,none,married,regularly,yes,3.0,school_bus,< 5,71,71,74
1,1,female,group C,some college,standard,,married,sometimes,yes,0.0,,5 - 10,69,90,88
2,2,female,group B,master's degree,standard,none,single,sometimes,yes,4.0,school_bus,< 5,87,93,91
3,3,male,group A,associate's degree,free/reduced,none,married,never,no,1.0,,5 - 10,45,56,42
4,4,male,group C,some college,standard,none,married,sometimes,yes,0.0,school_bus,5 - 10,76,78,75


In [13]:
# 1
# Count the rows for each gender by summing boolean masks
# (rows with a missing Gender match neither mask).
is_female = df['Gender'] == 'female'
is_male = df['Gender'] == 'male'
female = int(is_female.sum())
male = int(is_male.sum())
female, male

(15424, 15217)

In [14]:
# 2, 3
# Share of each gender among all rows of the dataset, in percent.
n_rows = len(df)
fem_percentage = female / n_rows * 100
male_percentage = male / n_rows * 100
fem_percentage, male_percentage

(50.337782709441605, 49.6622172905584)

In [39]:
# 4, 5, 6
# For each subject, find the ethnic group with the highest mean score.
# groupby() drops rows whose EthnicGroup is missing, so only labelled groups compete.
# Each result is looked up in its own subject's series; reusing the math series
# for reading and writing would silently report the math answer three times.
math_df = df.groupby('EthnicGroup')['MathScore'].mean()
math_res = math_df.idxmax()

read_df = df.groupby('EthnicGroup')['ReadingScore'].mean()
read_res = read_df.idxmax()

write_df = df.groupby('EthnicGroup')['WritingScore'].mean()
write_res = write_df.idxmax()

math_res, read_res, write_res

('group E', 'group E', 'group E')

In [29]:
# 7, 8, 9
# Mean score per subject for students whose TestPrep is 'completed'.
# The row mask is built once and reused for all three subjects.
completed = df["TestPrep"] == 'completed'
mean_math = df.loc[completed, "MathScore"].mean()
mean_read = df.loc[completed, "ReadingScore"].mean()
mean_write = df.loc[completed, "WritingScore"].mean()
# Unweighted average of the three subject means.
mean = (mean_math + mean_read + mean_write) / 3
# NOTE(review): mean_write is computed but not displayed below; only its
# contribution to `mean` is visible - confirm this is intended.
mean_math, mean_read, mean

(69.5466599698644, 73.73299849321948, 72.66097438473129)

In [19]:
# 10, 11
# Number of students whose parents are single vs. married.
is_single = df['ParentMaritalStatus'] == 'single'
is_married = df['ParentMaritalStatus'] == 'married'
single = int(is_single.sum())
married = int(is_married.sum())
single, married

(7097, 16844)

In [26]:
# 12, 13
# Compare mean math/reading scores of students who practice sport
# regularly against those who never do.
plays_regularly = df["PracticeSport"] == 'regularly'
plays_never = df["PracticeSport"] == 'never'

regullary_math = df.loc[plays_regularly, "MathScore"].mean()
never_math = df.loc[plays_never, "MathScore"].mean()

regullary_read = df.loc[plays_regularly, "ReadingScore"].mean()
never_read = df.loc[plays_never, "ReadingScore"].mean()

# True in a slot means the "regularly" group has the higher mean for that subject.
regullary_math > never_math, regullary_read > never_read

(True, True)

In [23]:
# 14, 15
# Number of students per transport mean (rows with a missing value match neither).
uses_bus = df['TransportMeans'] == 'school_bus'
uses_private = df['TransportMeans'] == 'private'
bus = int(uses_bus.sum())
private = int(uses_private.sum())
bus, private

(16145, 11362)

In [41]:
# Classification

In [50]:
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

In [57]:
# Columns shown alongside the classification target.
# TestPrep is left out on purpose of this list: the target is derived from it.
cols = [
    # demographics / family background
    'Gender', 'EthnicGroup', 'ParentEduc', 'LunchType',
    'ParentMaritalStatus', 'PracticeSport', 'IsFirstChild', 'NrSiblings',
    # habits
    'TransportMeans', 'WklyStudyHours',
    # exam scores
    'MathScore', 'ReadingScore', 'WritingScore',
]

In [58]:
# Boolean target: True only where TestPrep equals 'completed'.
# Rows with a missing TestPrep compare unequal and therefore become False.
df['Outcome'] = df['TestPrep'].eq('completed')
df[[*cols, 'Outcome']]

Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore,ReadingScore,WritingScore,Outcome
0,female,,bachelor's degree,standard,married,regularly,yes,3.0,school_bus,< 5,71,71,74,False
1,female,group C,some college,standard,married,sometimes,yes,0.0,,5 - 10,69,90,88,False
2,female,group B,master's degree,standard,single,sometimes,yes,4.0,school_bus,< 5,87,93,91,False
3,male,group A,associate's degree,free/reduced,married,never,no,1.0,,5 - 10,45,56,42,False
4,male,group C,some college,standard,married,sometimes,yes,0.0,school_bus,5 - 10,76,78,75,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30636,female,group D,high school,standard,single,sometimes,no,2.0,school_bus,5 - 10,59,61,65,False
30637,male,group E,high school,standard,single,regularly,no,1.0,private,5 - 10,58,53,51,False
30638,female,,high school,free/reduced,married,sometimes,no,1.0,private,5 - 10,61,70,67,True
30639,female,group D,associate's degree,standard,married,regularly,no,3.0,school_bus,5 - 10,82,90,93,True


In [63]:
# Columns routed to the numeric branch of the preprocessing transformer.
num_features = ['MathScore', 'ReadingScore', 'WritingScore']

# Columns routed to the one-hot branch.
# 'EthnicGroup' is currently excluded; NrSiblings is listed here, i.e. it is
# one-hot encoded rather than treated as a numeric feature.
cat_features = [
    'Gender',
    'ParentEduc',
    'LunchType',
    'ParentMaritalStatus',
    'PracticeSport',
    'IsFirstChild',
    'TransportMeans',
    'WklyStudyHours',
    'NrSiblings',
]

In [67]:
# Numeric branch: fill gaps with the most frequent value, then standardize.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy="most_frequent")),
        ('scaler', StandardScaler()),
    ]
)

# Full preprocessing: numeric branch for num_features, one-hot encoding for
# cat_features. Columns not listed in either group are dropped by the
# transformer's default `remainder` setting.
pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_features),
        ("cat", OneHotEncoder(), cat_features),
    ]
)

In [68]:
# Build the feature matrix by fitting the preprocessing transformer on the full frame.
# NOTE(review): fitting the scaler/encoder on all rows before cross-validation lets
# test-fold statistics leak into training; consider wrapping the transformer and the
# model in a single Pipeline and cross-validating that instead.
X = pipeline.fit_transform(df)
# The boolean target column created from TestPrep is named 'Outcome'. The name
# 'IsTestCompleted' is never defined in this notebook and raises KeyError on a
# fresh kernel (Restart & Run All).
Y = df['Outcome'].to_numpy()

In [69]:
# Estimate accuracy of a logistic-regression classifier with 3 random 90/10 splits.
# random_state pins the shuffles so the reported score is reproducible across runs;
# without it every execution draws different splits and prints a different number.
cv = ShuffleSplit(n_splits=3, test_size=0.1, random_state=42)
cross_val_score(LogisticRegression(), X, Y, cv=cv).mean()

0.7510603588907014