In [21]:


import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Route DataFrame.plot(...) calls through plotly instead of pandas' default matplotlib backend.
pd.options.plotting.backend = "plotly"

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
# NOTE(review): a later cell defines its own create_distplot(column_name), which
# rebinds this imported name; that cell reaches plotly's version via ff.create_distplot.
from plotly.figure_factory import create_distplot
from plotly.offline import plot, iplot, init_notebook_mode
# Initialise plotly's offline notebook mode before any figure is shown.
init_notebook_mode(connected=True)


from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error,explained_variance_score
from xgboost import XGBRegressor

import os

# Location of the raw scores file. The default is the original author's local path;
# set the TEST_SCORES_CSV environment variable to run the notebook elsewhere.
DATA_PATH = os.environ.get(
    "TEST_SCORES_CSV",
    "/Users/tjeff/Desktop/CODING/project-4/Resources/test_scores.csv",
)

# List the files that sit next to the data file. os.walk() has to be given a
# directory: pointed at the CSV itself it yields nothing and nothing is printed.
for dirname, _, filenames in os.walk(os.path.dirname(DATA_PATH) or "."):
    for filename in filenames:
        print(os.path.join(dirname, filename))

data = pd.read_csv(DATA_PATH)


In [22]:
# Report the dataset's dimensions, then preview its first rows.
n_rows, n_cols = data.shape
print("The data contains", n_rows, "rows and", n_cols, "columns\n")
data.head()

The data contains 2133 rows and 11 columns



Unnamed: 0,school,school_setting,school_type,classroom,teaching_method,n_student,student_id,gender,lunch,pretest,posttest
0,ANKYI,Urban,Non-public,6OL,Standard,20.0,2FHT3,Female,Does not qualify,62.0,72.0
1,ANKYI,Urban,Non-public,6OL,Standard,20.0,3JIVH,Female,Does not qualify,66.0,79.0
2,ANKYI,Urban,Non-public,6OL,Standard,20.0,3XOWE,Male,Does not qualify,64.0,76.0
3,ANKYI,Urban,Non-public,6OL,Standard,20.0,556O0,Female,Does not qualify,61.0,77.0
4,ANKYI,Urban,Non-public,6OL,Standard,20.0,74LOE,Male,Does not qualify,64.0,76.0


In [23]:
separator = "-" * 50

# Column dtypes and non-null counts.
data.info()
print(separator)

# True when at least one column has a non-zero count of missing values.
null_counts = data.isna().sum()
print("Any null values in the data:", null_counts.any())
print(separator)

# Show the distinct values of every object-typed column.
for col_name, column in data.items():
    if column.dtype == "object":
        print(col_name, ":", column.unique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2133 entries, 0 to 2132
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   school           2133 non-null   object 
 1   school_setting   2133 non-null   object 
 2   school_type      2133 non-null   object 
 3   classroom        2133 non-null   object 
 4   teaching_method  2133 non-null   object 
 5   n_student        2133 non-null   float64
 6   student_id       2133 non-null   object 
 7   gender           2133 non-null   object 
 8   lunch            2133 non-null   object 
 9   pretest          2133 non-null   float64
 10  posttest         2133 non-null   float64
dtypes: float64(3), object(8)
memory usage: 183.4+ KB
--------------------------------------------------
Any null values in the data: False
--------------------------------------------------
school : ['ANKYI' 'CCAAW' 'CIMBB' 'CUQAM' 'DNQDD' 'FBUMG' 'GJJHK' 'GOKXL' 'GOOBU'
 'IDGFP' 'KFZMY' 'KZKK

In [24]:


# One entry per categorical bar panel, in grid order (left to right, top to bottom):
# (column in `data`, subplot title, x-axis label).
bar_panels = [
    ('school_setting', "School Setting distribution", "School Setting"),
    ('school_type', "School Type Distribution", "School Type"),
    ('teaching_method', 'Teaching method distribution', "Teaching Method"),
    ('gender', 'Gender distribution', "Gender"),
    ('lunch', 'Lunch distribution', "lunch"),
]
scatter_title = 'Relationship Between Post-test and Pre-test'
grid_cols = 2

fig = make_subplots(
    rows=3,
    cols=grid_cols,
    subplot_titles=tuple(panel[1] for panel in bar_panels) + (scatter_title,),
)

# Bar panels: share of rows per category, with the y axis formatted as a percentage.
for position, (feature, panel_title, axis_label) in enumerate(bar_panels):
    grid_row = position // grid_cols + 1
    grid_col = position % grid_cols + 1
    counts = data[feature].value_counts()
    fig.add_trace(
        go.Bar(x=counts.index.values, y=counts.values / np.sum(counts)),
        row=grid_row,
        col=grid_col,
    )
    fig.update_xaxes(title_text=axis_label, row=grid_row, col=grid_col)
    fig.update_yaxes(title_text="Percent of total", tickformat=',.0%', row=grid_row, col=grid_col)

# Last panel: pretest against posttest for every row.
fig.add_trace(go.Scatter(x=data['pretest'], y=data['posttest'], mode='markers'), row=3, col=2)
fig.update_xaxes(title_text="Pretest", row=3, col=2)
fig.update_yaxes(title_text="Posttest", row=3, col=2)

fig.update_layout(title_text="Ploting Features", height=700)

fig.show()

In [25]:
# Overlay the distributions of both test scores on a single figure.
group_labels = ['pretest', 'posttest']
hist_data = [data[label] for label in group_labels]

fig_distributions = ff.create_distplot(
    hist_data,
    group_labels,
    bin_size=.5,
    show_rug=False,
)
fig_distributions.update_layout(title_text="Posttest and Pretest Distribution", height=500)
fig_distributions.show()

In [26]:
def create_distplot(column_name):
    """Show the posttest distribution for each distinct value of ``column_name``.

    Reads the notebook-level ``data`` frame, builds one posttest sample per
    distinct value of the column (in order of first appearance) and displays
    the resulting plotly distplot. Returns nothing.

    NOTE(review): this definition rebinds the ``create_distplot`` name that the
    imports cell pulls from ``plotly.figure_factory``; the plotly function is
    therefore reached through ``ff.create_distplot`` here.
    """
    categories = data[column_name].unique()
    posttest_by_category = [
        data.loc[data[column_name] == category, 'posttest'].values
        for category in categories
    ]

    fig = ff.create_distplot(
        posttest_by_category,
        categories,
        show_hist=True,
        show_rug=False,
        bin_size=.5,
    )
    fig.update_layout(title_text="Posttest Results by School " + column_name, height=500)
    fig.show()

In [27]:
# One distribution plot per object-typed column, skipping the two columns listed here.
skipped_columns = ('student_id', 'classroom')
for col_name in data.columns:
    if col_name in skipped_columns or data[col_name].dtype != "object":
        continue
    create_distplot(col_name)

In [28]:
# Features are every column except the target and the student identifier.
X = data.drop(columns=['posttest', 'student_id'])
y = data['posttest']

# Listing categorical and numeric features for the pipeline.
# Any numeric dtype counts as numeric: matching only float64 would leave an
# integer or float32 column out of both lists, and ColumnTransformer drops
# columns that no transformer claims (remainder='drop' is its default).
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_features = X.select_dtypes(include='number').columns.tolist()

# Specifying the preprocessing steps: scale numeric columns, one-hot encode
# categorical ones. handle_unknown='ignore' keeps transform() from failing on
# categories that were not present when the encoder was fitted.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Splitting the dataset into train (70%) and test (30%) with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)


In [29]:
def model_evaluation(y_test, y_predicted):
    """Print regression metrics comparing true and predicted values.

    Computes mean absolute error, mean squared error, R2 and the explained
    variance score for ``y_predicted`` against ``y_test`` and prints them as
    one four-line report. Returns nothing.
    """
    mae = mean_absolute_error(y_test, y_predicted)
    mse = mean_squared_error(y_test, y_predicted)
    r_squared = r2_score(y_test, y_predicted)
    explained_variance = explained_variance_score(y_test, y_predicted)

    report = (
        f"Mean absolute error: {round(mae, 3)}\n"
        f"Mean squared error: {round(mse, 3)}\n"
        f"R2: {round(r_squared, 3)}\n"
        # Printed at full precision, unlike the three rounded metrics above.
        f"Explained Variance Score:{explained_variance}"
    )
    print(report)


In [30]:
def check_overfitting_scatter(y_test,y_predict,model):
    """Show a scatter plot of predicted against true scores with an OLS trendline.

    Parameters
    ----------
    y_test : true target values, plotted on the x axis.
    y_predict : predicted values, plotted on the y axis.
    model : text used as the figure title.

    Returns nothing; the figure is displayed.
    """
    # Plot the predictions that were passed in. Reading a notebook-level
    # `y_hat` here instead would fail on a fresh kernel, or silently plot
    # whichever predictions were computed last.
    fig = px.scatter(
        x=y_test,
        y=y_predict,
        trendline='ols',
        labels=dict(x='True scores', y='Predicted scores'),
        title=model,
    )
    fig.show()

In [31]:
# Shared preprocessing followed by a linear regression, fitted on the training split.
linear_steps = [
    ('preprocessor', preprocessor),
    ('regr', LinearRegression()),
]
Linear_regr = Pipeline(steps=linear_steps)
Linear_regr.fit(x_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['n_student', 'pretest']),
                                                 ('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['school', 'school_setting',
                                                   'school_type', 'classroom',
                                                   'teaching_method', 'gender',
                                                   'lunch'])])),
                ('regr', LinearRegression())])

In [32]:
# Predict on the held-out split, then report metrics and plot predicted vs. true.
y_hat = Linear_regr.predict(x_test)

model_evaluation(y_test, y_hat)
check_overfitting_scatter(
    y_test,
    y_hat,
    'Linear Regression:Predicted Scores vs. True Scores',
)

# Scored on the training split, for comparison with the test metrics above.
linear_train_score = Linear_regr.score(x_train, y_train)
print("Model score:", linear_train_score)

Mean absolute error: 2.253
Mean squared error: 8.194
R2: 0.958
Explained Variance Score:0.9578339130797362


Model score: 0.9626951975835959


In [33]:
# Shared preprocessing followed by an XGBoost regressor, fitted on the training split.
xgboost_steps = [
    ('preprocessor', preprocessor),
    ('xgboost', XGBRegressor(verbosity=0)),
]
xgboost = Pipeline(steps=xgboost_steps)
xgboost.fit(x_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['n_student', 'pretest']),
                                                 ('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['school', 'school_setting',
                                                   'school_type', 'classroom',
                                                   'teaching_method', 'gender',
                                                   'lunch'])])),
                ('xgboost',
                 XGBRegressor(base_score=0.5, b...
                              gamma=0, gpu

In [34]:
# Scored on the training split, for comparison with the test metrics below.
xgboost_train_score = xgboost.score(x_train, y_train)
print("Model score:", xgboost_train_score)

# Predict on the held-out split, then report metrics and plot predicted vs. true.
y_hat = xgboost.predict(x_test)
model_evaluation(y_test, y_hat)
check_overfitting_scatter(
    y_test,
    y_hat,
    'Xgboost: Plotting predicted vs true scores',
)

Model score: 0.981794330012139
Mean absolute error: 2.563
Mean squared error: 10.44
R2: 0.946
Explained Variance Score:0.9463547901456592
