In [15]:
import pandas as pd
import numpy as np
from pathlib import Path

In [16]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
df_enrollment_data = pd.read_csv(
    "Resources/enrollment_data.csv", sep=';')

#Display all columns in dataframe
pd.set_option('display.max_columns', None)


#Rename columns
df_enrollment_data.rename(columns = {'Daytime/evening attendance\t': 'Attendance (Evening/Daytime)',
                          'Nacionality': 'Nationality',
                        'Age at enrollment': 'Age at time of enrollment',
                        'International': 'International Student'}, inplace = True)

#Change column characters
df_enrollment_data.columns = (
    df_enrollment_data.columns
    .str.replace('1st sem', '- S1')
    .str.replace('2nd sem', '- S2') 
)

# Review the DataFrame
df_enrollment_data

Unnamed: 0,Marital status,Application mode,Application order,Course,Attendance (Evening/Daytime),Previous qualification,Previous qualification (grade),Nationality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at time of enrollment,International Student,Curricular units - S1 (credited),Curricular units - S1 (enrolled),Curricular units - S1 (evaluations),Curricular units - S1 (approved),Curricular units - S1 (grade),Curricular units - S1 (without evaluations),Curricular units - S2 (credited),Curricular units - S2 (enrolled),Curricular units - S2 (evaluations),Curricular units - S2 (approved),Curricular units - S2 (grade),Curricular units - S2 (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,5,9,127.3,1,0,0,1,1,0,20,0,0,0,0,0,0.000000,0,0,0,0,0,0.000000,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,3,3,142.5,1,0,0,0,1,0,19,0,0,6,6,6,14.000000,0,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,9,9,124.8,1,0,0,0,1,0,19,0,0,6,0,0,0.000000,0,0,6,0,0,0.000000,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,5,3,119.6,1,0,0,1,0,0,20,0,0,6,8,6,13.428571,0,0,6,10,5,12.400000,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,9,9,141.5,0,0,0,1,0,0,45,0,0,6,9,5,12.333333,0,0,6,6,6,13.000000,0,13.9,-0.3,0.79,Graduate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,1,1,6,9773,1,1,125.0,1,1,1,5,4,122.2,0,0,0,1,1,0,19,0,0,6,7,5,13.600000,0,0,6,8,5,12.666667,0,15.5,2.8,-4.06,Graduate
4420,1,1,2,9773,1,1,120.0,105,1,1,9,9,119.0,1,0,1,0,0,0,18,1,0,6,6,6,12.000000,0,0,6,6,2,11.000000,0,11.1,0.6,2.02,Dropout
4421,1,1,1,9500,1,1,154.0,1,37,37,9,9,149.5,1,0,0,1,0,1,30,0,0,7,8,7,14.912500,0,0,8,9,1,13.500000,0,13.9,-0.3,0.79,Dropout
4422,1,1,1,9147,1,1,180.0,1,37,37,7,4,153.8,1,0,0,1,0,1,20,0,0,5,5,5,13.800000,0,0,5,6,5,12.000000,0,9.4,-0.8,-3.12,Graduate


In [17]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = df_enrollment_data["Target"]

# Separate the X variable, the features
X = df_enrollment_data.drop(columns=["Target"])

# Review the y variable Series
y.head()

0     Dropout
1    Graduate
2     Dropout
3    Graduate
4    Graduate
Name: Target, dtype: object

In [18]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
# Display the shapes of the training and testing sets to confirm the split
print("Training Features shape (X_train):", X_train.shape)
print("Testing Features shape (X_test):", X_test.shape)
print("Training Labels shape (y_train):", y_train.shape)
print("Testing Labels shape (y_test):", y_test.shape)


Training Features shape (X_train): (3318, 36)
Testing Features shape (X_test): (1106, 36)
Training Labels shape (y_train): (3318,)
Testing Labels shape (y_test): (1106,)


In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing

# Create a StandardScaler instance
scaler = preprocessing.StandardScaler()

# Fit the scaler to the training data and transform both training and testing data
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 and increase max_iteration to 5000
logistic_regression_model = LogisticRegression(random_state=1, C=0.01, max_iter=5000)
logistic_regression_model


In [20]:
# Fit the model using the scaled training data
logistic_regression_model.fit(X_train_scaled, y_train)

In [21]:
# Get the feature names from the DataFrame
feature_names = X.columns

# Get the coefficients from the logistic regression model
coefficients = logistic_regression_model.coef_[0]

# Create a DataFrame to display feature names and their corresponding coefficients
feature_importance = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# Score the model
print(f"Training Data Score: {logistic_regression_model.score(X_train, y_train)}")
print(f"Testing Data Score: {logistic_regression_model.score(X_test, y_test)}")

Training Data Score: 0.32127787823990356
Testing Data Score: 0.3209764918625678


In [22]:
predictions = logistic_regression_model.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
2803,Dropout,Graduate
1743,Dropout,Dropout
658,Dropout,Graduate
2683,Dropout,Graduate
3733,Dropout,Dropout
...,...,...
3688,Dropout,Graduate
374,Dropout,Dropout
1766,Dropout,Dropout
3114,Dropout,Graduate


In [23]:
# Now you can evaluate the model on the scaled test data
accuracy = logistic_regression_model.score(X_test_scaled, y_test)
print(f"Model accuracy: {accuracy:.2f}")

Model accuracy: 0.74
