In [26]:
from faker import Faker
import random
import pandas as pd

# Initialize Faker
fake = Faker()

# Define the number of students
num_students = 10000

# Create empty lists to store the data
names = []
grades = []
engagement = []
socio_economic_status = []
first_gen = []
interactions = []

# Generate synthetic data
for _ in range(num_students):
    names.append(fake.name())
    grades.append(random.randint(0, 100))
    engagement.append(random.choice(['Low', 'Medium', 'High']))
    socio_economic_status.append(random.choice(['Low', 'Medium', 'High']))
    first_gen.append(random.choice([True, False]))
    interactions.append(random.randint(0, 100))

# Create a DataFrame
data = pd.DataFrame({
    'Name': names,
    'Grade': grades,
    'Engagement': engagement,
    'Socio-Economic Status': socio_economic_status,
    'First Generation Student': first_gen,
    'LMS Interactions': interactions
})

# Save the DataFrame to a CSV file
data.to_csv('student_data.csv', index=False)

## Data Preparation  

In [97]:
# Read the data
data = pd.read_csv('student_data.csv')
data.head()

Unnamed: 0,Name,Grade,Engagement,Socio-Economic Status,First Generation Student,LMS Interactions
0,Susan Chandler,10,Low,High,False,0
1,Joyce Walker,14,Low,High,False,24
2,Anthony Lewis,75,High,Medium,True,33
3,Laura Gibson,92,High,High,True,94
4,Sean Lee,64,High,High,True,16


In [28]:
#Check data info and null values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Name                      10000 non-null  object
 1   Grade                     10000 non-null  int64 
 2   Engagement                10000 non-null  object
 3   Socio-Economic Status     10000 non-null  object
 4   First Generation Student  10000 non-null  bool  
 5   LMS Interactions          10000 non-null  int64 
dtypes: bool(1), int64(2), object(3)
memory usage: 400.5+ KB


In [98]:
# Drop the unwanted columns [Name]
data.drop(columns=['Name'], inplace=True)
data.head()

Unnamed: 0,Grade,Engagement,Socio-Economic Status,First Generation Student,LMS Interactions
0,10,Low,High,False,0
1,14,Low,High,False,24
2,75,High,Medium,True,33
3,92,High,High,True,94
4,64,High,High,True,16


## Import the packages

In [53]:
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib

In [99]:
# Handle Missing Values
for col in data.select_dtypes(include=[np.number]).columns:
    data[col].fillna(data[col].mean(), inplace=True)

for col in data.select_dtypes(include=[object, bool]).columns:
    data[col].fillna(data[col].mode()[0], inplace=True)

In [100]:
# Encode Categorical Variables
categorical_features = ['Engagement', 'Socio-Economic Status', 'First Generation Student']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder())
])

In [101]:
# Normalize the Data
numerical_features = ['Grade']
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())
])

In [91]:
# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [105]:
#Feature engineering
if 'Engagement' not in data.columns:
    data['Engagement'] = None

data['Grade_Engagement'] = data['Grade'] * data['Engagement'].map({'low': 1, 'medium': 2, 'high': 3})

# Define the features and target
X = data.drop(columns=['LMS Interactions'])
y = data['LMS Interactions'].apply(lambda x: 0 if x == 0 else 1)  # Target: 0 for dropout, 1 for active

In [106]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [107]:
X_train.columns

Index(['Grade', 'Engagement', 'Socio-Economic Status',
       'First Generation Student', 'Grade_Engagement'],
      dtype='object')

In [109]:
# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])
    
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
])



In [114]:
# Pipeline with preprocessing and model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

In [116]:
X_train

Unnamed: 0,Grade,Engagement,Socio-Economic Status,First Generation Student,Grade_Engagement
9254,55,Low,Low,False,
1561,60,Low,Medium,False,
1670,82,High,Medium,True,
6087,33,Medium,Low,False,
6669,29,Low,Low,False,
...,...,...,...,...,...
5734,64,Medium,High,False,
5191,88,Medium,Low,True,
5390,65,Medium,High,True,
860,58,Low,Low,False,


In [115]:
# Train the model
pipeline.fit(X_train, y_train)

In [117]:
# Predictions
y_pred = pipeline.predict(X_test)

In [118]:
# Evaluation Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

Accuracy: 0.99
Precision: 0.99
Recall: 1.00
F1 Score: 0.99


In [119]:
# Cross-Validation
cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
print(f'Cross-Validation Accuracy: {cv_scores.mean():.2f}')

Cross-Validation Accuracy: 0.99


In [120]:
# Feature Importance
pipeline.named_steps['classifier'].feature_importances_

array([0.87751632, 0.016303  , 0.02164067, 0.01610817, 0.01307964,
       0.01200125, 0.01300013, 0.01355644, 0.01679439])

In [121]:
# Save the model
joblib.dump(pipeline, 'random_forest_model.pkl')

['random_forest_model.pkl']

In [122]:
# Load the model
model = joblib.load('random_forest_model.pkl')