In [109]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [110]:
# Read the raw test-score records from the CSV that sits next to the notebook.
scores_csv = Path('test_scores.csv')
df = pd.read_csv(scores_csv)
df.head()

Unnamed: 0,school,school_setting,school_type,classroom,teaching_method,n_student,student_id,gender,lunch,pretest,posttest
0,ANKYI,Urban,Non-public,6OL,Standard,20.0,2FHT3,Female,Does not qualify,62.0,72.0
1,ANKYI,Urban,Non-public,6OL,Standard,20.0,3JIVH,Female,Does not qualify,66.0,79.0
2,ANKYI,Urban,Non-public,6OL,Standard,20.0,3XOWE,Male,Does not qualify,64.0,76.0
3,ANKYI,Urban,Non-public,6OL,Standard,20.0,556O0,Female,Does not qualify,61.0,77.0
4,ANKYI,Urban,Non-public,6OL,Standard,20.0,74LOE,Male,Does not qualify,64.0,76.0


In [111]:
# Remove the school, classroom and student_id columns from the working frame.
df = df.drop(["school", "classroom", "student_id"], axis="columns")
df.head()

Unnamed: 0,school_setting,school_type,teaching_method,n_student,gender,lunch,pretest,posttest
0,Urban,Non-public,Standard,20.0,Female,Does not qualify,62.0,72.0
1,Urban,Non-public,Standard,20.0,Female,Does not qualify,66.0,79.0
2,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,76.0
3,Urban,Non-public,Standard,20.0,Female,Does not qualify,61.0,77.0
4,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,76.0


In [112]:
# Score gain from pretest to posttest, expressed as a percentage of the posttest score.
# NOTE(review): the denominator is posttest; a conventional percent change divides by
# the baseline (pretest). Confirm this is intended -- the growth cut-off applied to
# this column depends on which definition is used.
score_gain = df['posttest'] - df['pretest']
df["percent_change"] = (score_gain / df['posttest']) * 100
df.head(10)

Unnamed: 0,school_setting,school_type,teaching_method,n_student,gender,lunch,pretest,posttest,percent_change
0,Urban,Non-public,Standard,20.0,Female,Does not qualify,62.0,72.0,13.888889
1,Urban,Non-public,Standard,20.0,Female,Does not qualify,66.0,79.0,16.455696
2,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,76.0,15.789474
3,Urban,Non-public,Standard,20.0,Female,Does not qualify,61.0,77.0,20.779221
4,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,76.0,15.789474
5,Urban,Non-public,Standard,20.0,Female,Does not qualify,66.0,74.0,10.810811
6,Urban,Non-public,Standard,20.0,Male,Does not qualify,63.0,75.0,16.0
7,Urban,Non-public,Standard,20.0,Female,Does not qualify,63.0,72.0,12.5
8,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,77.0,16.883117
9,Urban,Non-public,Standard,20.0,Female,Does not qualify,61.0,72.0,15.277778


In [113]:
# Append an empty-string placeholder column that will hold the growth label.
df = df.assign(High_Low_Growth='')
df.head()

Unnamed: 0,school_setting,school_type,teaching_method,n_student,gender,lunch,pretest,posttest,percent_change,High_Low_Growth
0,Urban,Non-public,Standard,20.0,Female,Does not qualify,62.0,72.0,13.888889,
1,Urban,Non-public,Standard,20.0,Female,Does not qualify,66.0,79.0,16.455696,
2,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,76.0,15.789474,
3,Urban,Non-public,Standard,20.0,Female,Does not qualify,61.0,77.0,20.779221,
4,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,76.0,15.789474,


In [114]:
# apply a cut-off on percent change
def growth_rating(x, threshold=15):
    """Label a percent-change value as high or low growth.

    Parameters
    ----------
    x : number
        Percent change to classify.
    threshold : number, default 15
        Inclusive cut-off: values greater than or equal to it are rated high.

    Returns
    -------
    str
        "High Growth" when ``x >= threshold``, otherwise "Low Growth".
        NaN compares False against any threshold, so it is rated "Low Growth".
    """
    return "High Growth" if x >= threshold else "Low Growth"


# Label every row by passing its percent change through growth_rating.
growth_labels = df['percent_change'].map(growth_rating)
df['High_Low_Growth'] = growth_labels
df.head()

Unnamed: 0,school_setting,school_type,teaching_method,n_student,gender,lunch,pretest,posttest,percent_change,High_Low_Growth
0,Urban,Non-public,Standard,20.0,Female,Does not qualify,62.0,72.0,13.888889,Low Growth
1,Urban,Non-public,Standard,20.0,Female,Does not qualify,66.0,79.0,16.455696,High Growth
2,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,76.0,15.789474,High Growth
3,Urban,Non-public,Standard,20.0,Female,Does not qualify,61.0,77.0,20.779221,High Growth
4,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,76.0,15.789474,High Growth


In [115]:
# Non-null count per column: a quick check that no column has missing values.
df.count(axis="index")

school_setting     2133
school_type        2133
teaching_method    2133
n_student          2133
gender             2133
lunch              2133
pretest            2133
posttest           2133
percent_change     2133
High_Low_Growth    2133
dtype: int64

In [116]:
# Class balance of the growth label across the full data set.
df.loc[:, 'High_Low_Growth'].value_counts()

High Growth    1447
Low Growth      686
Name: High_Low_Growth, dtype: int64

In [117]:
# Keep only the rows whose school_setting is 'Suburban'.
# NOTE(review): the frame is named urban_df but it holds Suburban schools --
# confirm whether the name or the filter value is the intended one.
is_suburban = df['school_setting'].eq('Suburban')
urban_df = df.loc[is_suburban]
urban_df.head()

Unnamed: 0,school_setting,school_type,teaching_method,n_student,gender,lunch,pretest,posttest,percent_change,High_Low_Growth
41,Suburban,Non-public,Experimental,18.0,Female,Does not qualify,61.0,75.0,18.666667,High Growth
42,Suburban,Non-public,Experimental,18.0,Male,Qualifies for reduced/free lunch,58.0,78.0,25.641026,High Growth
43,Suburban,Non-public,Experimental,18.0,Female,Qualifies for reduced/free lunch,64.0,82.0,21.95122,High Growth
44,Suburban,Non-public,Experimental,18.0,Male,Qualifies for reduced/free lunch,58.0,77.0,24.675325,High Growth
45,Suburban,Non-public,Experimental,18.0,Male,Does not qualify,65.0,87.0,25.287356,High Growth


In [118]:
# Non-null count per column for the filtered subset (shows the subset size).
urban_df.count(axis="index")

school_setting     717
school_type        717
teaching_method    717
n_student          717
gender             717
lunch              717
pretest            717
posttest           717
percent_change     717
High_Low_Growth    717
dtype: int64

In [119]:
# Build the feature matrix: remove the label, the score-derived columns and the
# setting column used for filtering, then one-hot encode what remains.
non_feature_cols = ["High_Low_Growth", "percent_change", "posttest", "pretest", "school_setting"]
X = pd.get_dummies(urban_df.drop(columns=non_feature_cols))

X.head()

Unnamed: 0,n_student,school_type_Non-public,school_type_Public,teaching_method_Experimental,teaching_method_Standard,gender_Female,gender_Male,lunch_Does not qualify,lunch_Qualifies for reduced/free lunch
41,18.0,1,0,1,0,1,0,1,0
42,18.0,1,0,1,0,0,1,0,1
43,18.0,1,0,1,0,1,0,0,1
44,18.0,1,0,1,0,0,1,0,1
45,18.0,1,0,1,0,0,1,1,0


In [120]:
# Target vector: an independent copy of the growth label column.
y = urban_df["High_Low_Growth"].copy()

In [121]:
from sklearn.model_selection import train_test_split

# Split features and labels into train/test partitions using the default
# proportions; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [122]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (X_scaler.transform(split) for split in (X_train, X_test))

# Fit a plain RandomForestClassifier on the scaled training data.
# No resampling or class balancing is performed here.
rf_model = RandomForestClassifier(n_estimators=100, random_state=1).fit(X_train_scaled, y_train)

# Predict growth labels for the held-out test split.
predictions = rf_model.predict(X_test_scaled)

In [123]:
# Balanced accuracy of the predictions on the held-out test split.
balanced_accuracy_score(y_true=y_test, y_pred=predictions)

0.6935731660478812

In [124]:
# Display the confusion matrix.
# The class order is pinned explicitly so the rows/columns always line up with
# the hard-coded captions below, and the matrix stays 2x2 even if one class is
# absent from y_test or predictions.
class_labels = ["High Growth", "Low Growth"]
cm = confusion_matrix(y_test, predictions, labels=class_labels)

# Wrap the matrix in a DataFrame: rows are actual classes, columns are predicted.
cm_df = pd.DataFrame(
    cm,
    index=["Actual High_Growth", "Actual Low_Growth"],
    columns=["Predicted High_Growth", "Predicted Low_Growth"],
)

cm_df

Unnamed: 0,Predicted High_Growth,Predicted Low_Growth
Actual High_Growth,81,26
Actual Low_Growth,27,46


In [125]:
# Print the imbalanced classification report for the test split.
# classification_report_imbalanced is already imported in the notebook's
# top import cell, so it is not re-imported here.
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

High Growth       0.75      0.76      0.63      0.75      0.69      0.48       107
 Low Growth       0.64      0.63      0.76      0.63      0.69      0.47        73

avg / total       0.70      0.71      0.68      0.71      0.69      0.48       180



In [126]:
# Pair each importance with its feature name (the two are aligned by column
# position) and list the pairs from most to least important.
importances = rf_model.feature_importances_
sorted(zip(importances, X.columns), reverse=True)

[(0.2894695737173753, 'n_student'),
 (0.2501762742233606, 'teaching_method_Experimental'),
 (0.22352169666109953, 'teaching_method_Standard'),
 (0.08398827029097457, 'lunch_Qualifies for reduced/free lunch'),
 (0.07997348470980509, 'lunch_Does not qualify'),
 (0.01956159284461639, 'gender_Female'),
 (0.019500720757985406, 'gender_Male'),
 (0.018123276142580493, 'school_type_Public'),
 (0.01568511065220269, 'school_type_Non-public')]