In [24]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [25]:
df = pd.read_csv('test_scores.csv')
df.head()

Unnamed: 0,school,school_setting,school_type,classroom,teaching_method,n_student,student_id,gender,lunch,pretest,posttest
0,ANKYI,Urban,Non-public,6OL,Standard,20.0,2FHT3,Female,Does not qualify,62.0,72.0
1,ANKYI,Urban,Non-public,6OL,Standard,20.0,3JIVH,Female,Does not qualify,66.0,79.0
2,ANKYI,Urban,Non-public,6OL,Standard,20.0,3XOWE,Male,Does not qualify,64.0,76.0
3,ANKYI,Urban,Non-public,6OL,Standard,20.0,556O0,Female,Does not qualify,61.0,77.0
4,ANKYI,Urban,Non-public,6OL,Standard,20.0,74LOE,Male,Does not qualify,64.0,76.0


In [26]:
df = df.drop(columns = ["school", "classroom", "student_id"])
df.head()

Unnamed: 0,school_setting,school_type,teaching_method,n_student,gender,lunch,pretest,posttest
0,Urban,Non-public,Standard,20.0,Female,Does not qualify,62.0,72.0
1,Urban,Non-public,Standard,20.0,Female,Does not qualify,66.0,79.0
2,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,76.0
3,Urban,Non-public,Standard,20.0,Female,Does not qualify,61.0,77.0
4,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,76.0


In [27]:
df["percent_change"] = ((df['posttest'] - df['pretest']) / df['posttest']) * 100
df.head(10)

Unnamed: 0,school_setting,school_type,teaching_method,n_student,gender,lunch,pretest,posttest,percent_change
0,Urban,Non-public,Standard,20.0,Female,Does not qualify,62.0,72.0,13.888889
1,Urban,Non-public,Standard,20.0,Female,Does not qualify,66.0,79.0,16.455696
2,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,76.0,15.789474
3,Urban,Non-public,Standard,20.0,Female,Does not qualify,61.0,77.0,20.779221
4,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,76.0,15.789474
5,Urban,Non-public,Standard,20.0,Female,Does not qualify,66.0,74.0,10.810811
6,Urban,Non-public,Standard,20.0,Male,Does not qualify,63.0,75.0,16.0
7,Urban,Non-public,Standard,20.0,Female,Does not qualify,63.0,72.0,12.5
8,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,77.0,16.883117
9,Urban,Non-public,Standard,20.0,Female,Does not qualify,61.0,72.0,15.277778


In [28]:
# add a blank column
df["High_Low_Growth"] = ''
df.head()

Unnamed: 0,school_setting,school_type,teaching_method,n_student,gender,lunch,pretest,posttest,percent_change,High_Low_Growth
0,Urban,Non-public,Standard,20.0,Female,Does not qualify,62.0,72.0,13.888889,
1,Urban,Non-public,Standard,20.0,Female,Does not qualify,66.0,79.0,16.455696,
2,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,76.0,15.789474,
3,Urban,Non-public,Standard,20.0,Female,Does not qualify,61.0,77.0,20.779221,
4,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,76.0,15.789474,


In [29]:
# apply a cut-off on percent change
def growth_rating(x):
    if x >= 15:
        result = "High Growth"
    else:
        result = "Low Growth"
    return result


df['High_Low_Growth'] = df['percent_change'].apply(growth_rating)
df.head()

Unnamed: 0,school_setting,school_type,teaching_method,n_student,gender,lunch,pretest,posttest,percent_change,High_Low_Growth
0,Urban,Non-public,Standard,20.0,Female,Does not qualify,62.0,72.0,13.888889,Low Growth
1,Urban,Non-public,Standard,20.0,Female,Does not qualify,66.0,79.0,16.455696,High Growth
2,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,76.0,15.789474,High Growth
3,Urban,Non-public,Standard,20.0,Female,Does not qualify,61.0,77.0,20.779221,High Growth
4,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,76.0,15.789474,High Growth


In [30]:
df.count()

school_setting     2133
school_type        2133
teaching_method    2133
n_student          2133
gender             2133
lunch              2133
pretest            2133
posttest           2133
percent_change     2133
High_Low_Growth    2133
dtype: int64

In [31]:
df['High_Low_Growth'].value_counts()

High Growth    1447
Low Growth      686
Name: High_Low_Growth, dtype: int64

In [32]:
X = df.drop(columns = ["High_Low_Growth", "percent_change", "posttest"])
X = pd.get_dummies(X)

X.head()

Unnamed: 0,n_student,pretest,school_setting_Rural,school_setting_Suburban,school_setting_Urban,school_type_Non-public,school_type_Public,teaching_method_Experimental,teaching_method_Standard,gender_Female,gender_Male,lunch_Does not qualify,lunch_Qualifies for reduced/free lunch
0,20.0,62.0,0,0,1,1,0,0,1,1,0,1,0
1,20.0,66.0,0,0,1,1,0,0,1,1,0,1,0
2,20.0,64.0,0,0,1,1,0,0,1,0,1,1,0
3,20.0,61.0,0,0,1,1,0,0,1,1,0,1,0
4,20.0,64.0,0,0,1,1,0,0,1,0,1,1,0


In [33]:
y = df.loc[:, "High_Low_Growth"].copy()

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X,
    y, random_state=1, stratify=y)

In [35]:
classifier = LogisticRegression(solver='lbfgs', random_state=1)

In [36]:
classifier.fit(X_train, y_train)

LogisticRegression(random_state=1)

In [37]:
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
2111,High Growth,Low Growth
1194,Low Growth,Low Growth
1050,High Growth,High Growth
1997,High Growth,High Growth
1536,High Growth,High Growth
...,...,...
1135,Low Growth,Low Growth
1611,Low Growth,High Growth
1408,Low Growth,High Growth
1228,Low Growth,High Growth


In [38]:
# Calculated the balanced accuracy score
accuracy_score(y_test, predictions)

0.8164794007490637

In [39]:
# Display the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual High_Growth", "Actual Low_Growth"], columns=["Predicted High_Growth", "Predicted Low_Growth"])

cm_df

Unnamed: 0,Predicted High_Growth,Predicted Low_Growth
Actual High_Growth,322,40
Actual Low_Growth,58,114


In [40]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

High Growth       0.85      0.89      0.66      0.87      0.77      0.60       362
 Low Growth       0.74      0.66      0.89      0.70      0.77      0.58       172

avg / total       0.81      0.82      0.74      0.81      0.77      0.59       534



In [41]:
# List the features sorted in descending order by feature importance
importances = rf_model.feature_importances_
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

NameError: name 'rf_model' is not defined