In [None]:
#import modules
from EDA import Preprocessing
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix
from IPython.display import display
from imblearn.over_sampling import SMOTE

from sklearn.svm import SVC



In [None]:
# Read the two student-performance CSV files (semicolon-separated).
# The directory is named once so the notebook can be pointed at another
# location by editing a single line. pandas is already imported in the
# first cell, so it is not re-imported here.
DATA_DIR = '/home/pirate/Documents/machine_learning/score_prediction/student'

df_mat = pd.read_csv(f'{DATA_DIR}/student-mat.csv', sep=';')
df_por = pd.read_csv(f'{DATA_DIR}/student-por.csv', sep=';')


In [None]:
# Handling duplicates found in datasets
# Columns used as the duplicate key: two rows that agree on all of them are
# treated as the same record.
DUPLICATE_KEY_COLUMNS = ["school", "sex", "age", "address", "famsize", "Pstatus",
                         "Medu", "Fedu", "Mjob", "Fjob", "reason", "nursery", "internet"]

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# the same way and renumbers the index from 0.
df_stacked = pd.concat([df_mat, df_por], ignore_index=True)
no_of_duplicates = df_stacked.duplicated(DUPLICATE_KEY_COLUMNS).sum()
print(f'---> We have {no_of_duplicates} duplicates found in our two datasets\n')

print('---> Merging datasets and removing duplicates\n')
# Keep the first occurrence of every key; assign the result instead of using inplace=True.
df_merged = df_stacked.drop_duplicates(subset=DUPLICATE_KEY_COLUMNS, keep='first')

print(f'---> After merging and removing duplicates we have {len(df_merged)} unique rows')


In [None]:
                    # PERFORMING EDA
# Wrap the merged frame in the project's Preprocessing helper
preprocessor = Preprocessing(df_merged)

# Missing-value check; its return value becomes the working frame `df`
df = preprocessor.check_missing_value()

# Descriptive statistics of the working frame
preprocessor.descriptives(df)

# Data exploration: split the column names by dtype.
# Object-typed columns are treated as categorical, everything else as continuous.
categorical_columns_selector = selector(dtype_include=object)
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns = categorical_columns_selector(df)
continous_columns = numerical_columns_selector(df)

# All column names of the working frame, in their original order
feature_list = df.columns.tolist()


In [None]:
# First five colours of seaborn's pastel palette
colors = sns.color_palette('pastel')[:5]

# Number of rows per level of `sex`
sns.countplot(data=df, x='sex')
plt.show()

# Study time by guardian, split by sex
sns.boxplot(data=df, x='guardian', y='studytime', hue='sex')
plt.show()

# G3 per school, one jittered and dodged strip per sex
sns.stripplot(data=df, x='school', y='G3', hue='sex', jitter=True, dodge=True)
plt.show()

# Study time by guardian, one panel per level of `romantic`
g = sns.catplot(data=df, x='guardian', y='studytime', col='romantic',
                kind='bar', aspect=.6, palette='Set2')
g.set_axis_labels("Guardian", "Study Time")
g.set_titles("relationship : {col_name}")
plt.show()

# G3 by internet access, split by sex
sns.barplot(data=df, x='internet', y='G3', hue='sex', palette='rainbow')
plt.show()


In [None]:
#DATASET ENCODING OF CATEGORICAL VARIABLES
df_copy = df.copy()  # encode a copy so the original frame keeps its string values
ord_enc = OrdinalEncoder()  # encoder object

print('Performing EnCoding Of Categorical variables')
# Fit the encoder once over all categorical columns. OrdinalEncoder encodes
# each column independently, so the codes equal those of a per-column fit,
# but the fitted encoder now remembers the categories of every column
# (a per-column re-fit keeps only the last one) and can invert the encoding.
if categorical_columns:  # OrdinalEncoder rejects an input with zero columns
    encoded_values = ord_enc.fit_transform(df_copy[categorical_columns])
    for position, col in enumerate(categorical_columns):
        df_copy[col] = encoded_values[:, position]
df_copy.head(10)

In [None]:
                        #REGRESSION
# Split the encoded frame into predictors (every column except G3) and the
# regression target (the grade G3).
X = df_copy.drop('G3', axis=1)
Y = df_copy['G3']

# 80:20 train/test split. The seed is fixed (same value the regressor uses)
# so the split, and every metric computed from it, is reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Convert to numpy array. This happens after the split, so x_train / x_test
# remain DataFrames with column names; only the name X now refers to an array.
X = np.array(X)

# show summary of the split datasets
print("shape of original dataset :", df_copy.shape)
print("shape of input - training set", x_train.shape)
print("shape of output - training set", y_train.shape)
print("shape of input - testing set", x_test.shape)
print("shape of output - testing set", y_test.shape)

In [None]:
                            # Random Forest Regression
# Random forest of 20 trees with a fixed seed
regressor = RandomForestRegressor(n_estimators=20, random_state=0)

# fit the regressor on the training split
regressor.fit(x_train, y_train)

# Use the forest's predict method on the test data
y_pred = regressor.predict(x_test)

# Metrics
errors = abs(y_pred - y_test)
# The percentage error is undefined where the true value is 0 (the division
# yields inf and poisons the mean), so only rows with a non-zero target
# contribute to MAPE.
nonzero_target = y_test != 0
if nonzero_target.any():
    mape = 100 * np.mean(errors[nonzero_target] / y_test[nonzero_target])
else:
    mape = float('nan')
# "Accuracy" here is 100 - MAPE, not a classification accuracy
accuracy = 100 - mape
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('Accuracy:', accuracy)

# Get numerical feature importances
importances = list(regressor.feature_importances_)
# Pair each importance with the columns the model was actually fitted on.
# feature_list still contains the target column, so it only lines up with
# the importances by column-order coincidence.
feature_importances = [(feature, round(importance, 2))
                       for feature, importance in zip(x_train.columns, importances)]
feature_importances = sorted(feature_importances, key=lambda pair: pair[1], reverse=True)

# Print out the features and importances and plot them
x, y = zip(*feature_importances)
plt.xticks(rotation=90)
plt.bar(x, y)
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

In [None]:
                            #CLASSIFICATION
# DIVIDE GRADE G3 INTO CLASSES
#   'Low'    : greater than 0 up to 6
#   'Medium' : greater than 6 up to 14
#   'High'   : greater than 14 up to 20
# NOTE(review): with these right-closed bins a grade of exactly 0 falls outside
# every interval and gets a missing class, so such rows end up in none of the
# three class frames below. Confirm this is intended.
df_copy['G3_Class'] = pd.cut(x=df_copy['G3'], bins=[0, 6, 14, 20], labels=['Low', 'Medium', 'High'])

# plotting for class imbalance (column passed by keyword, as in the other seaborn calls)
sns.countplot(x='G3_Class', data=df_copy)

# class count, looked up by label: value_counts() is sorted by frequency, so
# positional unpacking would silently mislabel the counts whenever the
# classes are not ordered Medium > High > Low.
class_counts = df_copy['G3_Class'].value_counts()
count_class_low = class_counts['Low']
count_class_medium = class_counts['Medium']
count_class_high = class_counts['High']

# Divide by class
class_low = df_copy[df_copy['G3_Class'] == 'Low']
class_medium = df_copy[df_copy['G3_Class'] == 'Medium']
class_high = df_copy[df_copy['G3_Class'] == 'High']


In [None]:
# Solving Class imbalance problem using random over-sampling.
# NOTE(review): the resampling is applied to the whole frame before any
# train/test split, so copies of one row can land on both sides of a later split.

# Step 1: draw (with replacement) as many 'Low' rows as there are 'Medium' rows.
# The seed makes the drawn sample reproducible.
class_low_over = class_low.sample(count_class_medium, replace=True, random_state=0)
df_low_over = pd.concat([class_medium, class_low_over], axis=0)
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
combined_dataset = pd.concat([df_low_over, class_high], ignore_index=True)
print('Performing Over sampling technique to increase low class to medium class size while maintaining high class value')
print(combined_dataset.G3_Class.value_counts(), '\n')

# class count, looked up by label: after step 1 'Low' and 'Medium' are tied,
# so the frequency order returned by value_counts() is not a safe basis for
# positional unpacking.
class_counts = combined_dataset['G3_Class'].value_counts()
count_class_low = class_counts['Low']
count_class_medium = class_counts['Medium']
count_class_high = class_counts['High']

# Divide by class
class_low = combined_dataset[combined_dataset['G3_Class'] == 'Low']
class_medium = combined_dataset[combined_dataset['G3_Class'] == 'Medium']
class_high = combined_dataset[combined_dataset['G3_Class'] == 'High']

# Step 2: same procedure for 'High'; the already balanced 'Low' rows are kept as they are.
class_high_over = class_high.sample(count_class_medium, replace=True, random_state=0)
df_high_over = pd.concat([class_medium, class_high_over], axis=0)
dataset_final_sampling1 = pd.concat([df_high_over, class_low], ignore_index=True)
print('Performing Over sampling technique to increase high class to medium class size while maintaining low class value')
print(dataset_final_sampling1.G3_Class.value_counts(), "\n")

# plotting dataset that has been sampled
dataset_final_sampling1.G3_Class.value_counts().plot(kind='bar', title='Count (target)');

# NOTE(review): this splits the regression inputs X / Y, not the oversampled
# frame built above; kept unchanged so the *_new names stay defined.
x_train_new, x_test_new, y_train_new, y_test_new = train_test_split(X, Y, test_size=0.2)

In [163]:
# Encode the class label of the oversampled data and separate inputs from target.
# A copy is encoded so the oversampled frame keeps its original labels.
dataset_final_sampling1_copy = dataset_final_sampling1.copy()
ord_enc_new = OrdinalEncoder()

print('Performing EnCoding Of Categorical variables')

# Replace the G3_Class labels with the numeric codes produced by the encoder
class_codes = ord_enc_new.fit_transform(dataset_final_sampling1_copy[['G3_Class']])
dataset_final_sampling1_copy['G3_Class'] = class_codes

# Inputs: every column except the grade and the class derived from it.
# Target: the encoded class.
features = dataset_final_sampling1_copy.drop(columns=['G3', 'G3_Class'])
labels = dataset_final_sampling1_copy['G3_Class']

Performing EnCoding Of Categorical variables


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,G3_Class
0,0.0,0.0,15,1.0,1.0,1.0,1,1,0.0,2.0,...,3,2,2,3,3,10,7,8,10,2.0
1,0.0,0.0,16,1.0,0.0,1.0,3,3,2.0,2.0,...,3,2,1,2,5,4,6,10,10,2.0
2,0.0,1.0,16,1.0,1.0,1.0,2,2,2.0,2.0,...,4,4,1,1,3,0,12,12,11,2.0
3,0.0,0.0,15,1.0,0.0,1.0,4,4,4.0,1.0,...,3,3,1,2,2,0,10,8,9,2.0
4,0.0,0.0,15,1.0,0.0,1.0,2,1,3.0,2.0,...,2,2,1,1,4,4,10,12,12,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1435,0.0,0.0,17,1.0,0.0,1.0,1,1,0.0,2.0,...,3,3,1,1,3,4,5,5,6,1.0
1436,0.0,0.0,16,1.0,0.0,1.0,3,1,3.0,2.0,...,3,3,1,2,5,4,7,7,6,1.0
1437,1.0,0.0,17,0.0,0.0,1.0,1,2,2.0,2.0,...,5,5,1,3,1,14,6,5,5,1.0
1438,0.0,1.0,17,1.0,0.0,1.0,3,2,3.0,3.0,...,5,5,2,4,5,16,6,5,5,1.0


In [None]:
##Support Vector Machine
# Train on the oversampled classification data (features / labels) instead of
# the regression split x_train / y_train, whose target is the raw grade G3.
# The split is seeded for reproducibility and stratified so each class keeps
# its share in both parts.
# NOTE(review): the oversampling was done before this split, so duplicated
# rows may appear in both parts and inflate the test score.
x_train_cls, x_test_cls, y_train_cls, y_test_cls = train_test_split(
    features, labels, test_size=0.2, random_state=0, stratify=labels)

# gamma is not used by the linear kernel, so it is not passed
svc_model = SVC(C=.1, kernel='linear')
svc_model.fit(x_train_cls, y_train_cls)

prediction = svc_model.predict(x_test_cls)
# mean accuracy on the training split, then on the test split
print(svc_model.score(x_train_cls, y_train_cls))
print(svc_model.score(x_test_cls, y_test_cls))
