In [95]:
import warnings
warnings.filterwarnings('ignore')

from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd

from imblearn.metrics import classification_report_imbalanced
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [96]:
# Read the student test-score records and preview the first rows.
scores_path = Path("test_scores.csv")
df = pd.read_csv(scores_path)
df.head()

Unnamed: 0,school,school_setting,school_type,classroom,teaching_method,n_student,student_id,gender,lunch,pretest,posttest
0,ANKYI,Urban,Non-public,6OL,Standard,20.0,2FHT3,Female,Does not qualify,62.0,72.0
1,ANKYI,Urban,Non-public,6OL,Standard,20.0,3JIVH,Female,Does not qualify,66.0,79.0
2,ANKYI,Urban,Non-public,6OL,Standard,20.0,3XOWE,Male,Does not qualify,64.0,76.0
3,ANKYI,Urban,Non-public,6OL,Standard,20.0,556O0,Female,Does not qualify,61.0,77.0
4,ANKYI,Urban,Non-public,6OL,Standard,20.0,74LOE,Male,Does not qualify,64.0,76.0


In [97]:
# Discard the school / classroom / student identifier columns before modelling.
identifier_columns = ["school", "classroom", "student_id"]
df = df.drop(columns=identifier_columns)
df.head()

Unnamed: 0,school_setting,school_type,teaching_method,n_student,gender,lunch,pretest,posttest
0,Urban,Non-public,Standard,20.0,Female,Does not qualify,62.0,72.0
1,Urban,Non-public,Standard,20.0,Female,Does not qualify,66.0,79.0
2,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,76.0
3,Urban,Non-public,Standard,20.0,Female,Does not qualify,61.0,77.0
4,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,76.0


In [98]:
# Gain from pretest to posttest, expressed as a percentage of the posttest score.
# NOTE(review): a conventional percent change divides by the starting value
# (pretest); confirm that dividing by posttest is intended.
score_gain = df["posttest"] - df["pretest"]
df["percent_change"] = score_gain / df["posttest"] * 100
df.head(10)

Unnamed: 0,school_setting,school_type,teaching_method,n_student,gender,lunch,pretest,posttest,percent_change
0,Urban,Non-public,Standard,20.0,Female,Does not qualify,62.0,72.0,13.888889
1,Urban,Non-public,Standard,20.0,Female,Does not qualify,66.0,79.0,16.455696
2,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,76.0,15.789474
3,Urban,Non-public,Standard,20.0,Female,Does not qualify,61.0,77.0,20.779221
4,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,76.0,15.789474
5,Urban,Non-public,Standard,20.0,Female,Does not qualify,66.0,74.0,10.810811
6,Urban,Non-public,Standard,20.0,Male,Does not qualify,63.0,75.0,16.0
7,Urban,Non-public,Standard,20.0,Female,Does not qualify,63.0,72.0,12.5
8,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,77.0,16.883117
9,Urban,Non-public,Standard,20.0,Female,Does not qualify,61.0,72.0,15.277778


In [99]:
# add a blank column
# (placeholder of empty strings; the growth label is filled in by a later cell)
df = df.assign(High_Low_Growth="")
df.head()

Unnamed: 0,school_setting,school_type,teaching_method,n_student,gender,lunch,pretest,posttest,percent_change,High_Low_Growth
0,Urban,Non-public,Standard,20.0,Female,Does not qualify,62.0,72.0,13.888889,
1,Urban,Non-public,Standard,20.0,Female,Does not qualify,66.0,79.0,16.455696,
2,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,76.0,15.789474,
3,Urban,Non-public,Standard,20.0,Female,Does not qualify,61.0,77.0,20.779221,
4,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,76.0,15.789474,


In [100]:
# apply a cut-off on percent change
def growth_rating(x, threshold=15):
    """Classify a percent-change value as high or low growth.

    Parameters
    ----------
    x : float
        Percent change for a single row.
    threshold : float, default 15
        Inclusive cut-off; values at or above it are rated "High Growth".

    Returns
    -------
    str
        "High Growth" when ``x >= threshold``, otherwise "Low Growth".
        A NaN input compares False against the threshold and is therefore
        rated "Low Growth".
    """
    return "High Growth" if x >= threshold else "Low Growth"


# Label every row by passing its percent change through growth_rating.
percent_change = df["percent_change"]
df["High_Low_Growth"] = percent_change.map(growth_rating)
df.head()

Unnamed: 0,school_setting,school_type,teaching_method,n_student,gender,lunch,pretest,posttest,percent_change,High_Low_Growth
0,Urban,Non-public,Standard,20.0,Female,Does not qualify,62.0,72.0,13.888889,Low Growth
1,Urban,Non-public,Standard,20.0,Female,Does not qualify,66.0,79.0,16.455696,High Growth
2,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,76.0,15.789474,High Growth
3,Urban,Non-public,Standard,20.0,Female,Does not qualify,61.0,77.0,20.779221,High Growth
4,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,76.0,15.789474,High Growth


In [101]:
# Number of non-null entries in each column.
df.notna().sum()

school_setting     2133
school_type        2133
teaching_method    2133
n_student          2133
gender             2133
lunch              2133
pretest            2133
posttest           2133
percent_change     2133
High_Low_Growth    2133
dtype: int64

In [102]:
# Class balance of the target label.
growth_labels = df["High_Low_Growth"]
growth_labels.value_counts()

High Growth    1447
Low Growth      686
Name: High_Low_Growth, dtype: int64

In [103]:
# Build the feature matrix: remove the target together with percent_change and
# posttest (the target is computed from them), then one-hot encode the
# categorical columns.
label_derived_columns = ["High_Low_Growth", "percent_change", "posttest"]
X = pd.get_dummies(df.drop(columns=label_derived_columns))

X.head()

Unnamed: 0,n_student,pretest,school_setting_Rural,school_setting_Suburban,school_setting_Urban,school_type_Non-public,school_type_Public,teaching_method_Experimental,teaching_method_Standard,gender_Female,gender_Male,lunch_Does not qualify,lunch_Qualifies for reduced/free lunch
0,20.0,62.0,0,0,1,1,0,0,1,1,0,1,0
1,20.0,66.0,0,0,1,1,0,0,1,1,0,1,0
2,20.0,64.0,0,0,1,1,0,0,1,0,1,1,0
3,20.0,61.0,0,0,1,1,0,0,1,1,0,1,0
4,20.0,64.0,0,0,1,1,0,0,1,0,1,1,0


In [104]:
# Target vector: an independent copy of the growth label column.
y = df["High_Low_Growth"].copy()

In [105]:
# Hold out a test set using scikit-learn's default split proportions; the
# fixed random_state keeps the partition reproducible across runs.
# (train_test_split is imported with the other dependencies in the first cell.)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [106]:
# Standardize the features: fit the scaler on the training split only, then
# apply that same transform to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train a plain RandomForestClassifier. No resampling is performed here, even
# though the two growth classes are imbalanced; the fixed random_state makes
# the forest reproducible.
rf_model = RandomForestClassifier(n_estimators=100, random_state=1)

# fit the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# make prediction
predictions = rf_model.predict(X_test_scaled)

In [107]:
# Calculate the balanced accuracy score on the held-out split
balanced_accuracy_score(y_true=y_test, y_pred=predictions)

0.7316862280434887

In [108]:
# Display the confusion matrix
# Pin the class order explicitly so the rows/columns are guaranteed to line up
# with the captions below, and the matrix stays 2x2 even if one class is
# missing from y_test or predictions.
class_labels = ["High Growth", "Low Growth"]
cm = confusion_matrix(y_test, predictions, labels=class_labels)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm,
    index=["Actual High_Growth", "Actual Low_Growth"],
    columns=["Predicted High_Growth", "Predicted Low_Growth"],
)

cm_df

Unnamed: 0,Predicted High_Growth,Predicted Low_Growth
Actual High_Growth,305,56
Actual Low_Growth,66,107


In [109]:
# Print the imbalanced classification report
# (classification_report_imbalanced is already imported in the first cell, so
# it is not re-imported here; print() keeps the report's text layout intact.)
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

High Growth       0.82      0.84      0.62      0.83      0.72      0.53       361
 Low Growth       0.66      0.62      0.84      0.64      0.72      0.51       173

avg / total       0.77      0.77      0.69      0.77      0.72      0.53       534



In [110]:
# List the features sorted in descending order by feature importance
importances = rf_model.feature_importances_
# Pair each importance with its column name, then rank highest first.
feature_ranking = list(zip(importances, X.columns))
feature_ranking.sort(reverse=True)
feature_ranking

[(0.5434230084569486, 'pretest'),
 (0.14315170608782066, 'n_student'),
 (0.08383951070161032, 'teaching_method_Standard'),
 (0.08193537185945965, 'teaching_method_Experimental'),
 (0.028770227278290074, 'lunch_Qualifies for reduced/free lunch'),
 (0.024092175979176872, 'lunch_Does not qualify'),
 (0.018637282541226458, 'school_setting_Suburban'),
 (0.01494616366510252, 'school_setting_Rural'),
 (0.01409706180534993, 'gender_Female'),
 (0.01381668707730171, 'gender_Male'),
 (0.013325066667971262, 'school_setting_Urban'),
 (0.010888099687444159, 'school_type_Public'),
 (0.009077638192297786, 'school_type_Non-public')]