## Data imports

In [39]:
import pandas as pd    
import matplotlib.pyplot as plt
from scipy import stats

df = pd.read_csv("data.csv", delimiter=";")
for column_name, column_dtype in df.dtypes.items():
    print(f"Column Name: {column_name}, Data Type: {column_dtype}")

Column Name: Marital status, Data Type: int64
Column Name: Application mode, Data Type: int64
Column Name: Application order, Data Type: int64
Column Name: Course, Data Type: int64
Column Name: Daytime/evening attendance	, Data Type: int64
Column Name: Previous qualification, Data Type: int64
Column Name: Previous qualification (grade), Data Type: float64
Column Name: Nacionality, Data Type: int64
Column Name: Mother's qualification, Data Type: int64
Column Name: Father's qualification, Data Type: int64
Column Name: Mother's occupation, Data Type: int64
Column Name: Father's occupation, Data Type: int64
Column Name: Admission grade, Data Type: float64
Column Name: Displaced, Data Type: int64
Column Name: Educational special needs, Data Type: int64
Column Name: Debtor, Data Type: int64
Column Name: Tuition fees up to date, Data Type: int64
Column Name: Gender, Data Type: int64
Column Name: Scholarship holder, Data Type: int64
Column Name: Age at enrollment, Data Type: int64
Column Name:

# Preprocessing

In [40]:
df = df.dropna()

mothers_qualification_dict = {
    1: "Secondary Education",
    2: "Higher Education",
    3: "Higher Education",
    4: "Higher Education",
    5: "Higher Education",
    6: "Unknown",
    9: "Did Not Finish High School",
    10: "Did Not Finish High School",
    11: "Did Not Finish High School",
    12: "Did Not Finish High School",
    14: "Did Not Finish High School",
    18: "General Commerce Course",
    19: "Did Not Finish High School",
    22: "Technical-Professional Course",
    26: "Did Not Finish High School",
    27: "Secondary School",
    29: "Did Not Finish High School",
    30: "Did Not Finish High School",
    34: "Unknown",
    35: "Illiterate",
    36: "Did Not Finish High School",
    37: "Did Not Finish High School",
    38: "Did Not Finish High School",
    39: "Technological Specialization Course",
    40: "Higher Education",
    41: "Specialized Higher Studies Course",
    42: "Professional Higher Technical Course",
    43: "Higher Education",
    44: "Higher Education"
}


df["Mother's qualification"] = df["Mother's qualification"].map(mothers_qualification_dict)
# Dictionary for Father's qualification
fathers_qualification_dict = {
    1: "Secondary Education",
    2: "Higher Education",
    3: "Higher Education",
    4: "Higher Education",
    5: "Higher Education",
    6: "Unknown",
    9: "Did Not Finish High School",
    10: "Did Not Finish High School",
    11: "Did Not Finish High School",
    12: "Did Not Finish High School",
    13: "Secondary School",
    14: "Did Not Finish High School",
    18: "General Commerce Course",
    19: "Did Not Finish High School",
    20: "Complementary High School Course",
    22: "Technical-Professional Course",
    25: "Did Not Finish High School",
    26: "Did Not Finish High School",
    27: "Secondary School",
    29: "Did Not Finish High School",
    30: "Did Not Finish High School",
    31: "General Course of Administration and Commerce",
    33: "Supplementary Accounting and Administration",
    34: "Unknown",
    35: "Illiterate",
    36: "Did Not Finish High School",
    37: "Did Not Finish High School",
    38: "Did Not Finish High School",
    39: "Technological Specialization Course",
    40: "Higher Education",
    41: "Specialized Higher Studies Course",
    42: "Professional Higher Technical Course",
    43: "Higher Education",
    44: "Higher Education"
}

df["Father's qualification"] = df["Father's qualification"].map(fathers_qualification_dict)

# Dictionary for Mother's occupation
mothers_occupation_dict = {
    0: "Student",
    1: "Professional and Executive Roles",
    2: "Professional and Executive Roles",
    3: "Technical and Administrative Roles",
    4: "Technical and Administrative Roles",
    5: "Service and Sales Roles",
    6: "Skilled Workers",
    7: "Skilled Workers",
    8: "Machine and Assembly Workers",
    9: "Unskilled Workers",
    10: "Armed Forces Professions",
    90: "Other",
    99: "Unknown",
    122: "Professional and Executive Roles",
    123: "Professional and Executive Roles",
    125: "Professional and Executive Roles",
    131: "Professional and Executive Roles",
    132: "Professional and Executive Roles",
    134: "Professional and Executive Roles",
    141: "Technical and Administrative Roles",
    143: "Technical and Administrative Roles",
    144: "Technical and Administrative Roles",
    151: "Service and Sales Roles",
    152: "Service and Sales Roles",
    153: "Service and Sales Roles",
    171: "Skilled Workers",
    173: "Skilled Workers",
    175: "Skilled Workers",
    191: "Cleaning workers",
    192: "Unskilled Workers",
    193: "Unskilled Workers",
    194: "Unskilled Workers"
}

df["Mother's occupation"] = df["Mother's occupation"].map(mothers_occupation_dict)

# Dictionary for Father's occupation
fathers_occupation_dict = {
    0: "Student",
    1: "Professional and Executive Roles",
    2: "Technical and Administrative Roles",
    3: "Technical and Administrative Roles",
    4: "Technical and Administrative Roles",
    5: "Service and Sales Roles",
    6: "Skilled Workers",
    7: "Skilled Workers",
    8: "Machine and Assembly Workers",
    9: "Unskilled Workers",
    10: "Armed Forces Professions",
    90: "Other",
    99: "Unknown",
    101: "Professional and Executive Roles",
    102: "Armed Forces Sergeants",
    103: "Other Armed Forces personnel",
    112: "Professional and Executive Roles",
    114: "Professional and Executive Roles",
    121: "Professional and Executive Roles",
    122: "Professional and Executive Roles",
    123: "Professional and Executive Roles",
    124: "Professional and Executive Roles",
    131: "Professional and Executive Roles",
    132: "Professional and Executive Roles",
    134: "Professional and Executive Roles",
    135: "Professional and Executive Roles",
    141: "Technical and Administrative Roles",
    143: "Technical and Administrative Roles",
    144: "Technical and Administrative Roles",
    151: "Service and Sales Roles",
    152: "Service and Sales Roles",
    153: "Service and Sales Roles",
    154: "Service and Sales Roles",
    161: "Skilled Workers",
    163: "Skilled Workers",
    171: "Skilled Workers",
    172: "Skilled Workers",
    174: "Skilled Workers",
    175: "Skilled Workers",
    181: "Machine and Assembly Workers",
    182: "Machine and Assembly Workers",
    183: "Machine and Assembly Workers",
    192: "Unskilled Workers",
    193: "Unskilled Workers",
    194: "Unskilled Workers",
    195: "Unskilled Workers"
}

df["Father's occupation"] = df["Father's occupation"].map(fathers_occupation_dict)

courses_dict = {
    33: "Biofuel Production Technologies",
    171: "Animation and Multimedia Design",
    8014: "Social Service (evening attendance)",
    9003: "Agronomy",
    9070: "Communication Design",
    9085: "Veterinary Nursing",
    9119: "Informatics Engineering",
    9130: "Equinculture",
    9147: "Management",
    9238: "Social Service",
    9254: "Tourism",
    9500: "Nursing",
    9556: "Oral Hygiene",
    9670: "Advertising and Marketing Management",
    9773: "Journalism and Communication",
    9853: "Basic Education",
    9991: "Management (evening attendance)"
}
df['Course'] = df['Course'].map(courses_dict)
marital_status_dict = {
    1: "single",
    2: "married",
    3: "widower",
    4: "divorced",
    5: "facto union",
    6: "legally separated"
}
df['Marital status'] = df['Marital status'].map(marital_status_dict)
nationality_dict = {
    1: "Portuguese",
    2: "German",
    6: "Spanish",
    11: "Italian",
    13: "Dutch",
    14: "English",
    17: "Lithuanian",
    21: "Angolan",
    22: "Cape Verdean",
    24: "Guinean",
    25: "Mozambican",
    26: "Santomean",
    32: "Turkish",
    41: "Brazilian",
    62: "Romanian",
    100: "Moldova (Republic of)",
    101: "Mexican",
    103: "Ukrainian",
    105: "Russian",
    108: "Cuban",
    109: "Colombian"
}
df['Nacionality'] = df['Nacionality'].map(nationality_dict)

# Dictionary for Application mode
applicationmode_dict = {
    1: 'General Contingent',
    2: 'Special Contingent',
    7: 'Holders of other higher courses',
    10: 'International Applicant',
    16: 'Special Contingent',
    17: 'General Contingent',
    18: 'General Contingent',
    26: 'Special Cases and Exceptions',
    27: 'Special Cases and Exceptions',
    39: 'Special Cases and Exceptions',
    42: 'Transfers and Changes',
    43: 'Transfers and Changes',
    44: 'Diploma Holders',
    51: 'Transfers and Changes',
    53: 'Diploma Holders',
    57: 'International Applicant'
}

# Replace the codes with strings
df['Application mode'] = df['Application mode'].map(applicationmode_dict)
df = df.dropna(subset=['Application mode'])

# Dictionary for Previous qualification
previousqualification_dict = {
    1: 'Secondary Education',
    2: "Higher Education",
    3: "Higher Education",
    4: "Higher Education",
    5: "Higher Education",
    6: 'Other',
    9: 'Secondary Education',
    10: 'Basic - third',
    12: 'Basic - third',
    14: 'Basic - third',
    15: 'Basic - third',
    19: 'Basic - third',
    38: 'Below Third',
    39: 'Tech Specialization',
    40: "Higher Education",
    42: 'Higher Education- tech',
    43: "Higher Education",
}

# Replace the codes with strings
df['Previous qualification'] = df['Previous qualification'].map(previousqualification_dict)

df = df.drop(['Curricular units 2nd sem (credited)', 'Curricular units 2nd sem (enrolled)', 'Curricular units 2nd sem (evaluations)', 'Curricular units 2nd sem (approved)', 'Curricular units 2nd sem (grade)', 'Curricular units 2nd sem (without evaluations)'], axis=1)
# defining numerical and categorical variables
df_numerical = df[['Application order', 'Previous qualification (grade)', 'Admission grade', 'Age at enrollment','Curricular units 1st sem (credited)', 'Curricular units 1st sem (enrolled)', 'Curricular units 1st sem (evaluations)', 'Curricular units 1st sem (approved)', 'Curricular units 1st sem (grade)', 'Curricular units 1st sem (without evaluations)', 'Unemployment rate', 'Inflation rate', 'GDP']]
numerical_l = df_numerical.columns.tolist()
df_categorical = df[['Marital status', 'Application mode', 'Tuition fees up to date', 'Course', 'Daytime/evening attendance	', 'Previous qualification', 'Nacionality', "Mother's qualification", "Father's qualification", "Mother's occupation", "Father's occupation", 'Displaced', 'Educational special needs', 'Debtor', 'Gender', 'Scholarship holder', 'International']]
categorical_l = df_categorical.columns.tolist()

# Separate values with Target = Enrolled
df_enrolled_test = df[df['Target'] == 'Enrolled']
# drop rows with enrollment status = 'Enrolled' from df
df = df.drop(df[df['Target'] == 'Enrolled'].index)

In [41]:
# Setting the target as 'Target' and dropping it from X
X = df.drop("Target", axis=1)
y = df['Target']

# same thing for enrolled
X_enrolled = df_enrolled_test.drop("Target", axis=1)
y_enrolled = df_enrolled_test['Target']

## Scaling the data

In [42]:
# scale numerical variables
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = X.copy()
X_scaled[numerical_l] = scaler.fit_transform(X[numerical_l])
X_scaled_num = X_scaled[numerical_l]
# Dummify only the categorical columns in df_categorical
X = pd.get_dummies(X, columns=categorical_l, drop_first=False)
X_scaled = pd.get_dummies(X_scaled, columns=categorical_l, drop_first= False)


df_scaled = df.copy()
df_scaled[numerical_l] = scaler.fit_transform(df[numerical_l])
df_scaled_num = df_scaled[numerical_l]
df_scaled = pd.get_dummies(df_scaled, columns=categorical_l, drop_first=False)

# same thing for enrolled
X_enrolled = pd.get_dummies(X_enrolled, columns=categorical_l, drop_first=False)

## Splitting the data

In [43]:
# Split the data into training and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=5)

# Regression with all available variables

In [49]:
# Import Logistic Regression from sklearn Logistic Regression Model for all Variables # LOGISTIC REGRESSION FOR ALL VARIABLES 
from sklearn.linear_model import LogisticRegression

# Create a Logistic Regression model instance
logistic_model = LogisticRegression()

# Fit the model with training data
logistic_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = logistic_model.predict(X_test)

# Evaluate the model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Calculate Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Display the Classification Report
print(classification_report(y_test, y_pred))

# Display the Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print(f'Confusion Matrix: \n{conf_matrix}')
import pandas as pd
import numpy as np

# Assuming logistic_model is your trained Logistic Regression model
# and X_train contains your feature names

# Extract coefficients
coefficients = logistic_model.coef_[0]

# Create a DataFrame with feature names and coefficients
feature_names = X_train.columns
coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# Calculate the odds ratios (exponential of the coefficients)
coef_df['Odds Ratio'] = np.exp(coef_df['Coefficient'])

# Sort the features by the absolute value of their coefficients
# to identify the most significant predictors
coef_df = coef_df.reindex(coef_df['Coefficient'].abs().sort_values(ascending=False).index)

excel_path = "C:\\Users\\Admin\\Downloads\\logistic_regression_coefficients7.xlsx"
coef_df.to_excel(excel_path, index=False)

print(f'Results saved to {excel_path}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.8772077375946173
              precision    recall  f1-score   support

     Dropout       0.90      0.78      0.84       477
    Graduate       0.86      0.95      0.90       712

    accuracy                           0.88      1189
   macro avg       0.88      0.86      0.87      1189
weighted avg       0.88      0.88      0.88      1189

Confusion Matrix: 
[[370 107]
 [ 39 673]]
Results saved to C:\Users\Admin\Downloads\logistic_regression_coefficients7.xlsx


# Regression with variable selection from XGBoost (No grades as features)

In [51]:
#####Code for Logistic Regression on Selected Columns XGBoost Selected With No Grades : 'Scholarship holder_1', 'Debtor_1', 'Course_Basic Education', 'Course_Nursing', "Mother's occupation_Student", "Mother's qualification_Unknown", 'Age at enrollment', 'Gender_1', 'Course_Social Service (evening attendance)' 
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Specify the columns to be used in the model
selected_columns =  ['Scholarship holder_1', 'Debtor_1', 'Course_Basic Education', 'Course_Nursing', "Mother's occupation_Student", "Mother's qualification_Unknown", 'Age at enrollment', 'Gender_1', 'Course_Social Service (evening attendance)']
# Selecting only the specified columns for X
X_selected = X[selected_columns]

# Split the data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.33, random_state=5)

# Create a Logistic Regression model instance
logistic_model = LogisticRegression()

# Fit the model with training data
logistic_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = logistic_model.predict(X_test)

# Calculate Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Display the Classification Report
print(classification_report(y_test, y_pred))

# Display the Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print(f'Confusion Matrix: \n{conf_matrix}')
# Extract coefficients
coefficients = logistic_model.coef_[0]

# Create a DataFrame with feature names and coefficients
feature_names = X_train.columns
coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# Calculate the odds ratios (exponential of the coefficients)
coef_df['Odds Ratio'] = np.exp(coef_df['Coefficient'])

# Sort the features by the absolute value of their coefficients
# to identify the most significant predictors
coef_df = coef_df.reindex(coef_df['Coefficient'].abs().sort_values(ascending=False).index)

excel_path = "C:\\Users\\Admin\\Downloads\\logistic_regression_coefficients9.xlsx"
coef_df.to_excel(excel_path, index=False)

print(f'Results saved to {excel_path}')

Accuracy: 0.7392767031118587
              precision    recall  f1-score   support

     Dropout       0.72      0.57      0.64       477
    Graduate       0.75      0.86      0.80       712

    accuracy                           0.74      1189
   macro avg       0.74      0.71      0.72      1189
weighted avg       0.74      0.74      0.73      1189

Confusion Matrix: 
[[270 207]
 [103 609]]
Results saved to C:\Users\Admin\Downloads\logistic_regression_coefficients9.xlsx


# Regression with variable selection from RandomForest (No grades as features)

In [53]:
#####Code for Logistic Regression on Selected Columns RandomForest with NoGrades 
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Specify the columns to be used in the model
selected_columns = ['Age at enrollment', 'Admission grade', 'Previous qualification (grade)', 'Scholarship holder_1', 'Debtor_1', 'GDP', 'Unemployment rate', 'Inflation rate', 'Gender_1']
# Selecting only the specified columns for X
X_selected = X[selected_columns]

# Split the data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.33, random_state=5)

# Create a Logistic Regression model instance
logistic_model = LogisticRegression()

# Fit the model with training data
logistic_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = logistic_model.predict(X_test)

# Calculate Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Display the Classification Report
print(classification_report(y_test, y_pred))

# Display the Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print(f'Confusion Matrix: \n{conf_matrix}')

# Extract coefficients
coefficients = logistic_model.coef_[0]

# Create a DataFrame with feature names and coefficients
feature_names = X_train.columns
coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# Calculate the odds ratios (exponential of the coefficients)
coef_df['Odds Ratio'] = np.exp(coef_df['Coefficient'])

# Sort the features by the absolute value of their coefficients
# to identify the most significant predictors
coef_df = coef_df.reindex(coef_df['Coefficient'].abs().sort_values(ascending=False).index)

excel_path = "C:\\Users\\Admin\\Downloads\\logistic_regression_coefficients10.xlsx"
coef_df.to_excel(excel_path, index=False)

print(f'Results saved to {excel_path}')

Accuracy: 0.7132043734230445
              precision    recall  f1-score   support

     Dropout       0.68      0.53      0.60       477
    Graduate       0.73      0.84      0.78       712

    accuracy                           0.71      1189
   macro avg       0.71      0.68      0.69      1189
weighted avg       0.71      0.71      0.71      1189

Confusion Matrix: 
[[253 224]
 [117 595]]
Results saved to C:\Users\Admin\Downloads\logistic_regression_coefficients10.xlsx


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Regression with variable selection from XGBoost (Including Grades)

In [56]:
#####Code for Logistic Regression on Selected Columns  xgboost features with Grades 
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Specify the columns to be used in the model
selected_columns = ['Curricular units 1st sem (grade)','Curricular units 1st sem (approved)', 'Tuition fees up to date_1', 'Curricular units 1st sem (enrolled)', 'Course_Basic Education', 'Course_Informatics Engineering', 'Scholarship holder_1', 'Nacionality_Portuguese', 'Debtor_1', 'Course_Social Service', 'Course_Journalism and Communication', "Mother's occupation_Machine and Assembly Workers", 'Course_Equinculture', 'Course_Management (evening attendance)']
# Selecting only the specified columns for X
X_selected = X[selected_columns]

# Split the data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.33, random_state=5)

# Create a Logistic Regression model instance
logistic_model = LogisticRegression()

# Fit the model with training data
logistic_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = logistic_model.predict(X_test)

# Calculate Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Display the Classification Report
print(classification_report(y_test, y_pred))

# Display the Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print(f'Confusion Matrix: \n{conf_matrix}')

# Extract coefficients
coefficients = logistic_model.coef_[0]

# Create a DataFrame with feature names and coefficients
feature_names = X_train.columns
coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# Calculate the odds ratios (exponential of the coefficients)
coef_df['Odds Ratio'] = np.exp(coef_df['Coefficient'])

# Sort the features by the absolute value of their coefficients
# to identify the most significant predictors
coef_df = coef_df.reindex(coef_df['Coefficient'].abs().sort_values(ascending=False).index)

excel_path = "C:\\Users\\Admin\\Downloads\\logistic_regression_coefficients7.xlsx"
coef_df.to_excel(excel_path, index=False)

print(f'Results saved to {excel_path}')

Accuracy: 0.8713204373423045
              precision    recall  f1-score   support

     Dropout       0.88      0.79      0.83       477
    Graduate       0.87      0.93      0.90       712

    accuracy                           0.87      1189
   macro avg       0.87      0.86      0.86      1189
weighted avg       0.87      0.87      0.87      1189

Confusion Matrix: 
[[377 100]
 [ 53 659]]
Results saved to C:\Users\Admin\Downloads\logistic_regression_coefficients7.xlsx


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Regression with variable selection from RandomForest (Including Grades)

In [55]:
#####Code for Logistic Regression on Selected Columns  RandomForest features with Grades


#####Code for Logistic Regression on Selected Columns  xgboost features with Grades 
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Specify the columns to be used in the model
selected_columns = ['Curricular units 1st sem (approved)', 'Curricular units 1st sem (grade)', 'Tuition fees up to date_1', 'Age at enrollment', 'Curricular units 1st sem (evaluations)', 'Admission grade', 'Previous qualification (grade)', 'Curricular units 1st sem (enrolled)', 'Scholarship holder_1', 'Debtor_1', 'Unemployment rate', 'GDP', 'Inflation rate']
# Selecting only the specified columns for X
X_selected = X[selected_columns]

# Split the data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.33, random_state=5)

# Create a Logistic Regression model instance
logistic_model = LogisticRegression()

# Fit the model with training data
logistic_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = logistic_model.predict(X_test)

# Calculate Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Display the Classification Report
print(classification_report(y_test, y_pred))

# Display the Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print(f'Confusion Matrix: \n{conf_matrix}')

# Extract coefficients
coefficients = logistic_model.coef_[0]

# Create a DataFrame with feature names and coefficients
feature_names = X_train.columns
coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# Calculate the odds ratios (exponential of the coefficients)
coef_df['Odds Ratio'] = np.exp(coef_df['Coefficient'])

# Sort the features by the absolute value of their coefficients
# to identify the most significant predictors
coef_df = coef_df.reindex(coef_df['Coefficient'].abs().sort_values(ascending=False).index)

excel_path = "C:\\Users\\Admin\\Downloads\\logistic_regression_coefficients7.xlsx"
coef_df.to_excel(excel_path, index=False)

print(f'Results saved to {excel_path}')


Accuracy: 0.8772077375946173
              precision    recall  f1-score   support

     Dropout       0.90      0.78      0.84       477
    Graduate       0.86      0.95      0.90       712

    accuracy                           0.88      1189
   macro avg       0.88      0.86      0.87      1189
weighted avg       0.88      0.88      0.88      1189

Confusion Matrix: 
[[370 107]
 [ 39 673]]
Results saved to C:\Users\Admin\Downloads\logistic_regression_coefficients7.xlsx


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
