## Machine learning with Scikit-learn: Classification on this [dataset](https://github.com/hhaji/Applied-Machine-Learning/blob/master/Recitation-Assignments/Assignments_Data/Assignment_Set_8_Data.csv)
by [Zahra Taheri](https://github.com/zahta) (6 May 2020)

## Multiclass classification

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Raw GitHub location of the assignment CSV.
url='https://raw.githubusercontent.com/hhaji/Applied-Machine-Learning/master/Recitation-Assignments/Assignments_Data/Assignment_Set_8_Data.csv'

# Column labels of the dataset.
# NOTE(review): this list is not passed to read_csv below, so the columns are
# taken from the header row of the file itself.
names = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
    'math score',
    'reading score',
    'writing score',
]

# Download and parse the CSV, then display the resulting frame.
data = pd.read_csv(url)
data

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [3]:
#Preparing the data
# Integer codes for every categorical column, keyed by column name.
# NOTE(review): "associate's degree" is coded 6, above "master's degree" (5);
# if these codes are meant to be ordinal, confirm that this order is intended.
encodings = {
    'test preparation course': {'none': 1, 'completed': 2},
    'lunch': {'free/reduced': 1, 'standard': 2},
    'parental level of education': {
        'some high school': 1, 'high school': 2, 'some college': 3,
        "bachelor's degree": 4, "master's degree": 5, "associate's degree": 6},
    'race/ethnicity': {
        'group A': 1, 'group B': 2, 'group C': 3, 'group D': 4, 'group E': 5},
    'gender': {'male': 1, 'female': 2},
}

for column, codes in encodings.items():
    # Series.map turns every value that is not a key of `codes` into NaN, so
    # re-running this cell on already-encoded (numeric) columns would wipe
    # them out. Skipping numeric columns keeps the cell safe to re-run.
    if not pd.api.types.is_numeric_dtype(data[column]):
        data[column] = data[column].map(codes)

In [4]:
# Pairwise correlation coefficients between the columns of `data`.
corr_matrix = data.corr()

# Correlation of every column with race/ethnicity, largest value first.
corr_matrix.loc[:, "race/ethnicity"].sort_values(ascending=False)

race/ethnicity                 1.000000
math score                     0.216415
writing score                  0.165691
reading score                  0.145253
parental level of education    0.090930
lunch                          0.046563
test preparation course        0.017508
gender                         0.001502
Name: race/ethnicity, dtype: float64

In [5]:
# Select the target by name rather than by position, so the split stays
# correct if the column order of the CSV ever changes.
target_column = 'race/ethnicity'

# Feature matrix: every column except the target.
dataset = data.drop(columns=target_column)
X = dataset.values

# Target vector: the encoded race/ethnicity group.
y = data[target_column].values

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split

In [7]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply the same
# transformation to both the training and the test features.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Full Dimension

#### Logistic Regression

In [8]:
from sklearn.linear_model import LogisticRegression

# The `multi_class` argument is deprecated in recent scikit-learn releases.
# With the lbfgs solver and a target of more than two classes the default
# already fits a multinomial model, so the argument is simply omitted.
LR = LogisticRegression(max_iter=3000, random_state=0, solver='lbfgs')

# Train on the standardized features and predict the held-out set.
LR.fit(X_train, y_train)
y_pred = LR.predict(X_test)

In [9]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix of the latest predictions against the test labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Fraction of test samples that were classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy= ' + str(accuracy))

[[ 0  0 14  3  0]
 [ 0  1 23  7  5]
 [ 0  0 43 16  6]
 [ 0  0 23 31  3]
 [ 0  0 12  5  8]]
Accuracy= 0.415


#### Support Vector Machines (SVM)

In [10]:
from sklearn import svm

# Support vector classifier with a one-vs-one shaped decision function.
SVM = svm.SVC(decision_function_shape="ovo")

# fit() returns the estimator itself, so training and prediction can be chained.
y_pred = SVM.fit(X_train, y_train).predict(X_test)

In [11]:
# Confusion matrix of the latest predictions against the test labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Fraction of test samples that were classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy= ' + str(accuracy))

[[ 0  0 14  3  0]
 [ 0  0 28  6  2]
 [ 0  2 55  7  1]
 [ 0  0 42 15  0]
 [ 0  0 15  7  3]]
Accuracy= 0.365


#### Random Forests

In [12]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with trees limited to depth 10; the seed fixes the randomness.
RF = RandomForestClassifier(max_depth=10, random_state=0)

# fit() returns the estimator itself, so training and prediction can be chained.
y_pred = RF.fit(X_train, y_train).predict(X_test)

In [13]:
# Confusion matrix of the latest predictions against the test labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Fraction of test samples that were classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy= ' + str(accuracy))

[[ 0  1 11  3  2]
 [ 1  3 15 15  2]
 [ 1  5 39 15  5]
 [ 0  7 29 18  3]
 [ 0  0 15  3  7]]
Accuracy= 0.335


#### Neural Networks

In [14]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with two hidden layers (150 and 10 units),
# trained with the lbfgs solver for at most 10000 iterations.
NN = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(150, 10),
    random_state=0,
    max_iter=10000,
)

# fit() returns the estimator itself, so training and prediction can be chained.
y_pred = NN.fit(X_train, y_train).predict(X_test)

In [15]:
# Confusion matrix of the latest predictions against the test labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Fraction of test samples that were classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy= ' + str(accuracy))

[[ 0  3  5  6  3]
 [ 3  3 10 16  4]
 [ 7 11 20 19  8]
 [ 4  4 21 19  9]
 [ 2  3  7  6  7]]
Accuracy= 0.245


# Linear Discriminant Analysis (LDA)

In [16]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Fit a 4-component LDA projection on the training data (it is supervised,
# so the labels are required), then project both train and test features.
lda = LDA(n_components=4).fit(X_train, y_train)
X_train1 = lda.transform(X_train)
X_test1 = lda.transform(X_test)

#### Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression

# The `multi_class` argument is deprecated in recent scikit-learn releases.
# With the lbfgs solver and a target of more than two classes the default
# already fits a multinomial model, so the argument is simply omitted.
LR = LogisticRegression(max_iter=3000, random_state=0, solver='lbfgs')

# Train on the LDA-projected features and predict the held-out set.
LR.fit(X_train1, y_train)
y_pred = LR.predict(X_test1)

In [18]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix of the latest predictions against the test labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Fraction of test samples that were classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy= ' + str(accuracy))

[[ 0  0 14  3  0]
 [ 0  1 23  7  5]
 [ 0  0 39 17  9]
 [ 0  0 22 32  3]
 [ 0  0 12  5  8]]
Accuracy= 0.4


#### Support Vector Machines (SVM)

In [19]:
from sklearn import svm

# Support vector classifier with a one-vs-one shaped decision function.
SVM = svm.SVC(decision_function_shape="ovo")

# Train on the LDA-projected features; fit() returns the estimator itself.
y_pred = SVM.fit(X_train1, y_train).predict(X_test1)

In [20]:
# Confusion matrix of the latest predictions against the test labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Fraction of test samples that were classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy= ' + str(accuracy))

[[ 0  1 12  3  1]
 [ 0  2 24  6  4]
 [ 0  1 46 10  8]
 [ 0  0 29 24  4]
 [ 0  0 14  4  7]]
Accuracy= 0.395


#### Random Forests

In [21]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with trees limited to depth 10; the seed fixes the randomness.
RF = RandomForestClassifier(max_depth=10, random_state=0)

# Train on the LDA-projected features; fit() returns the estimator itself.
y_pred = RF.fit(X_train1, y_train).predict(X_test1)

In [22]:
# Confusion matrix of the latest predictions against the test labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Fraction of test samples that were classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy= ' + str(accuracy))

[[ 0  4  9  2  2]
 [ 0  5 17  9  5]
 [ 0  5 35 17  8]
 [ 0  3 23 24  7]
 [ 0  2  9  5  9]]
Accuracy= 0.365


#### Neural Networks

In [23]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with two hidden layers (150 and 10 units),
# trained with the lbfgs solver for at most 7000 iterations.
NN = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(150, 10),
    random_state=0,
    max_iter=7000,
)

# Train on the LDA-projected features; fit() returns the estimator itself.
y_pred = NN.fit(X_train1, y_train).predict(X_test1)

In [24]:
# Confusion matrix of the latest predictions against the test labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Fraction of test samples that were classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy= ' + str(accuracy))

[[ 0  5  7  3  2]
 [ 1  7 13  9  6]
 [ 4 12 20 18 11]
 [ 7  7 17 19  7]
 [ 4  6  6  5  4]]
Accuracy= 0.25


#  Principal Component Analysis (PCA)

In [25]:
from sklearn.decomposition import PCA

# Fit a PCA that keeps every component, purely to inspect how much variance
# each component explains. The projected train/test matrices are not computed
# here: they were never used before being recomputed with the reduced PCA.
pca = PCA()
pca.fit(X_train)

In [26]:
# Per-component explained-variance ratios of the fitted PCA
# (note: these are ratios, despite the variable name).
explained_variance = pca.explained_variance_ratio_
# Display the ratios as the cell output.
explained_variance

array([0.43158409, 0.1632094 , 0.14770439, 0.1371726 , 0.10295223,
       0.01199538, 0.00538191])

In [27]:
# Keep the first 5 principal components: fit on the training features and
# project them, then project the test features with the same fitted PCA.
# (The tuple is evaluated left to right, so the fit happens first.)
pca = PCA(n_components=5)
X_train2, X_test2 = pca.fit_transform(X_train), pca.transform(X_test)

#### Logistic Regression

In [28]:
from sklearn.linear_model import LogisticRegression

# The `multi_class` argument is deprecated in recent scikit-learn releases.
# With the lbfgs solver and a target of more than two classes the default
# already fits a multinomial model, so the argument is simply omitted.
LR = LogisticRegression(max_iter=3000, random_state=0, solver='lbfgs')

# Train on the PCA-projected features and predict the held-out set.
LR.fit(X_train2, y_train)
y_pred = LR.predict(X_test2)

In [29]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix of the latest predictions against the test labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Fraction of test samples that were classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy= ' + str(accuracy))

[[ 0  0 14  3  0]
 [ 0  0 27  9  0]
 [ 0  0 50 13  2]
 [ 0  0 34 23  0]
 [ 0  0 15 10  0]]
Accuracy= 0.365


#### Support Vector Machines (SVM)

In [30]:
from sklearn import svm

# Support vector classifier with a one-vs-one shaped decision function.
SVM = svm.SVC(decision_function_shape="ovo")

# Train on the PCA-projected features; fit() returns the estimator itself.
y_pred = SVM.fit(X_train2, y_train).predict(X_test2)

In [31]:
# Confusion matrix of the latest predictions against the test labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Fraction of test samples that were classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy= ' + str(accuracy))

[[ 0  0 14  3  0]
 [ 0  1 30  5  0]
 [ 0  1 53 11  0]
 [ 0  0 45 12  0]
 [ 0  0 17  8  0]]
Accuracy= 0.33


#### Random Forests

In [32]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with trees limited to depth 10; the seed fixes the randomness.
RF = RandomForestClassifier(max_depth=10, random_state=0)

# Train on the PCA-projected features; fit() returns the estimator itself.
y_pred = RF.fit(X_train2, y_train).predict(X_test2)

In [33]:
# Confusion matrix of the latest predictions against the test labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Fraction of test samples that were classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy= ' + str(accuracy))

[[ 0  0  7  9  1]
 [ 1  3 16 13  3]
 [ 2 12 34 15  2]
 [ 2  3 38 10  4]
 [ 0  1 12  8  4]]
Accuracy= 0.255


#### Neural Networks

In [34]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with two hidden layers (150 and 10 units),
# trained with the lbfgs solver for at most 12000 iterations.
NN = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(150, 10),
    random_state=0,
    max_iter=12000,
)

# Train on the PCA-projected features; fit() returns the estimator itself.
y_pred = NN.fit(X_train2, y_train).predict(X_test2)

In [35]:
# Confusion matrix of the latest predictions against the test labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Fraction of test samples that were classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy= ' + str(accuracy))

[[ 1  4  5  5  2]
 [ 2 11  7 12  4]
 [ 4 16 23 14  8]
 [ 2  8 24 17  6]
 [ 2  3  8  7  5]]
Accuracy= 0.285
