In [1]:
import warnings
from urllib.parse import quote

warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import psycopg2
from sqlalchemy import create_engine

from config import db_password

In [2]:
# Build the PostgreSQL connection URL for the local `test_scores` database.
# The password is percent-encoded first: characters such as "@", "/", ":" or
# "%" inside a raw password would otherwise be read as URL delimiters and
# produce a malformed or mis-parsed connection string.
encoded_password = quote(str(db_password), safe="")
db_string = f"postgresql://postgres:{encoded_password}@127.0.0.1:5432/test_scores"

In [3]:
# SQLAlchemy engine built from the connection URL above; it is passed to
# pandas as the connection when the table is read.
engine = create_engine(db_string)

In [4]:
# Read every row and column of `combined_table_info` into a DataFrame,
# then preview the first rows.
query = "SELECT * FROM combined_table_info"
df = pd.read_sql_query(sql=query, con=engine)
df.head()

Unnamed: 0,student_id,school,school_setting,school_type,classroom,teaching_method,n_student,gender,lunch,pretest,posttest
0,2FHT3,ANKYI,Urban,Non-public,6OL,Standard,20.0,Female,Does not qualify,62.0,72.0
1,3JIVH,ANKYI,Urban,Non-public,6OL,Standard,20.0,Female,Does not qualify,66.0,79.0
2,3XOWE,ANKYI,Urban,Non-public,6OL,Standard,20.0,Male,Does not qualify,64.0,76.0
3,556O0,ANKYI,Urban,Non-public,6OL,Standard,20.0,Female,Does not qualify,61.0,77.0
4,74LOE,ANKYI,Urban,Non-public,6OL,Standard,20.0,Male,Does not qualify,64.0,76.0


In [5]:
# Remove the school, classroom and student_id columns before modelling.
# Note: `df` is rebound here, so running this cell a second time raises a
# KeyError because the columns are already gone.
df = df.drop(["school", "classroom", "student_id"], axis="columns")
df.head()

Unnamed: 0,school_setting,school_type,teaching_method,n_student,gender,lunch,pretest,posttest
0,Urban,Non-public,Standard,20.0,Female,Does not qualify,62.0,72.0
1,Urban,Non-public,Standard,20.0,Female,Does not qualify,66.0,79.0
2,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,76.0
3,Urban,Non-public,Standard,20.0,Female,Does not qualify,61.0,77.0
4,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,76.0


In [6]:
# Score gain expressed as a percentage of the *posttest* score.
# NOTE(review): a conventional percent change divides by the baseline
# (pretest), not the final value. The growth threshold used later was
# presumably chosen against this definition, so confirm the intent before
# changing the denominator.
score_gain = df["posttest"] - df["pretest"]
df["percent_change"] = score_gain.div(df["posttest"]).mul(100)
df.head()

Unnamed: 0,school_setting,school_type,teaching_method,n_student,gender,lunch,pretest,posttest,percent_change
0,Urban,Non-public,Standard,20.0,Female,Does not qualify,62.0,72.0,13.888889
1,Urban,Non-public,Standard,20.0,Female,Does not qualify,66.0,79.0,16.455696
2,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,76.0,15.789474
3,Urban,Non-public,Standard,20.0,Female,Does not qualify,61.0,77.0,20.779221
4,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,76.0,15.789474


In [7]:

# Placeholder column filled with empty strings.
# NOTE(review): every value is overwritten by the .apply(growth_rating)
# assignment later in this cell, so this initialization looks redundant;
# confirm before removing.
df["High_Low_Growth"] = ''

def growth_rating(x, threshold=18.5):
    """Classify a percent-change value as high or low growth.

    Parameters
    ----------
    x : float
        Percent change in score for a single student.
    threshold : float, default 18.5
        Inclusive cut-off: values greater than or equal to it are rated
        "High Growth". The default reproduces the original hard-coded value.

    Returns
    -------
    str
        "High Growth" when ``x >= threshold``, otherwise "Low Growth".
        NaN compares as False, so a missing value is rated "Low Growth".
    """
    return "High Growth" if x >= threshold else "Low Growth"


# Label every row by running its percent_change through growth_rating.
df["High_Low_Growth"] = df["percent_change"].map(growth_rating)
df.head()

Unnamed: 0,school_setting,school_type,teaching_method,n_student,gender,lunch,pretest,posttest,percent_change,High_Low_Growth
0,Urban,Non-public,Standard,20.0,Female,Does not qualify,62.0,72.0,13.888889,Low Growth
1,Urban,Non-public,Standard,20.0,Female,Does not qualify,66.0,79.0,16.455696,Low Growth
2,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,76.0,15.789474,Low Growth
3,Urban,Non-public,Standard,20.0,Female,Does not qualify,61.0,77.0,20.779221,High Growth
4,Urban,Non-public,Standard,20.0,Male,Does not qualify,64.0,76.0,15.789474,Low Growth


In [8]:
# Non-null value count for every column (a quick check for missing data).
df.count()

school_setting     2133
school_type        2133
teaching_method    2133
n_student          2133
gender             2133
lunch              2133
pretest            2133
posttest           2133
percent_change     2133
High_Low_Growth    2133
dtype: int64

In [9]:
# Number of rows in each growth class, to see how balanced the target is.
df['High_Low_Growth'].value_counts()

Low Growth     1128
High Growth    1005
Name: High_Low_Growth, dtype: int64

In [10]:
# Feature matrix: exclude the target itself plus the columns it is derived
# from (percent_change, and the pretest/posttest scores that feed it), then
# one-hot encode the remaining categorical columns.
feature_frame = df.drop(
    ["High_Low_Growth", "percent_change", "posttest", "pretest"],
    axis="columns",
)
X = pd.get_dummies(feature_frame)

X.head()

Unnamed: 0,n_student,school_setting_Rural,school_setting_Suburban,school_setting_Urban,school_type_Non-public,school_type_Public,teaching_method_Experimental,teaching_method_Standard,gender_Female,gender_Male,lunch_Does not qualify,lunch_Qualifies for reduced/free lunch
0,20.0,0,0,1,1,0,0,1,1,0,1,0
1,20.0,0,0,1,1,0,0,1,1,0,1,0
2,20.0,0,0,1,1,0,0,1,0,1,1,0
3,20.0,0,0,1,1,0,0,1,1,0,1,0
4,20.0,0,0,1,1,0,0,1,0,1,1,0


In [11]:
# Target vector: an independent copy of the growth label column.
y = df["High_Low_Growth"].copy()

In [12]:
from sklearn.model_selection import train_test_split

# Hold out a test set using scikit-learn's default split size; the fixed
# random_state keeps the split reproducible between runs.
split_parts = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only, so no
# information from the test split leaks into preprocessing.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the same fitted scaler to both splits.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

# Train a plain RandomForestClassifier (100 trees, fixed seed) on the scaled
# training data. No resampling or class balancing is performed here.
rf_model = RandomForestClassifier(n_estimators=100, random_state=1).fit(
    X_train_scaled, y_train
)

# Predict the growth class for the held-out rows.
predictions = rf_model.predict(X_test_scaled)

In [14]:
from sklearn.metrics import balanced_accuracy_score

# Balanced accuracy (mean of per-class recall) of the test-set predictions.
balanced_accuracy_score(y_true=y_test, y_pred=predictions)

0.7380343082114735

In [15]:
from sklearn.metrics import confusion_matrix

# Pin the class order explicitly. The row/column headers below are
# hard-coded, so they are only correct if the matrix rows/columns come out
# as High Growth first, Low Growth second. Passing `labels` guarantees that
# order and keeps the matrix 2x2 even if a class is absent from the test
# split.
class_labels = ["High Growth", "Low Growth"]
cm = confusion_matrix(y_test, predictions, labels=class_labels)

# Wrap the raw counts in a labelled DataFrame for display
# (rows = actual class, columns = predicted class).
cm_df = pd.DataFrame(
    cm,
    index=["Actual High_Growth", "Actual Low_Growth"],
    columns=["Predicted High_Growth", "Predicted Low_Growth"],
)

cm_df

Unnamed: 0,Predicted High_Growth,Predicted Low_Growth
Actual High_Growth,169,85
Actual Low_Growth,53,227


In [16]:
from imblearn.metrics import classification_report_imbalanced

# Build the per-class report for the test-set predictions, then print it so
# the multi-line text renders as a table.
report = classification_report_imbalanced(y_test, predictions)
print(report)

                   pre       rec       spe        f1       geo       iba       sup

High Growth       0.76      0.67      0.81      0.71      0.73      0.53       254
 Low Growth       0.73      0.81      0.67      0.77      0.73      0.55       280

avg / total       0.74      0.74      0.73      0.74      0.73      0.54       534



In [17]:
# Pair each importance score with its feature name and list the pairs from
# most to least important.
importances = rf_model.feature_importances_
sorted(zip(importances, X.columns), reverse=True)

[(0.3612482841628457, 'n_student'),
 (0.15865452634885038, 'teaching_method_Standard'),
 (0.12362713142566739, 'teaching_method_Experimental'),
 (0.09620270760409341, 'lunch_Qualifies for reduced/free lunch'),
 (0.08556365920300184, 'lunch_Does not qualify'),
 (0.04189407409258582, 'school_setting_Suburban'),
 (0.02787658496287215, 'school_setting_Urban'),
 (0.02608846192019951, 'school_setting_Rural'),
 (0.021777822783547295, 'school_type_Non-public'),
 (0.020136094747501047, 'gender_Male'),
 (0.019123536889103575, 'gender_Female'),
 (0.017807115859731736, 'school_type_Public')]