# 1. Import libraries

In [None]:
# For mathematical operation
import pandas as pd
import numpy as np
# For drawing graphs
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# For measuring the time spent on each operation
import time
# For training models using scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm, preprocessing
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

# 2. Import data

In [None]:
# Load the training and testing sets (space-separated files without a header row)
data_train = pd.read_csv('/media/mydata/PHD Opportunity/WORK/data_train.csv', header=None, sep=" ")
data_test = pd.read_csv('/media/mydata/PHD Opportunity/WORK/data_test.csv', header=None, sep=" ")
# Merge both sets: transpose each one so that samples become columns, then
# place the two transposed frames side by side
df = pd.concat([data_train.T, data_test.T], axis=1)
# Number the samples uniformly from 0. The count comes from the merged frame
# itself rather than a hard-coded 58000, so files of any length can be loaded.
df.columns = list(range(df.shape[1]))
# Transpose back so that each row is one sample again
data = df.T
# Display the merged data
data

# 3. Data understanding

##  3.1 Data Visualization

### 3.1.1 Input visualization

In [None]:
######### Scatter plot of each feature #############

# Dataset with only the attributes (column 9 is dropped)
df1 = data.drop(9, axis=1)
# Define the figure and its size
plt.figure(figsize=(20, 12))
# Values for the x axis: the position of each sample, sized from the data
# instead of a hard-coded 58000
x = np.arange(len(df1))
# Draw the nine scatter plots on a grid of 3 rows and 3 columns
for i in range(9):
    # Values of the i-th feature
    df2 = df1[i]
    plt.subplot(3, 3, i + 1)
    # x and y are passed by keyword: seaborn deprecated and then dropped
    # positional x/y for this function
    sns.scatterplot(x=x, y=df2)
    # x label of each plot
    plt.xlabel('column' + str(i + 1))

### 3.1.2 Output visualization

In [None]:
######### Bar chart of the label #############

# Define the figure and its size
plt.figure(figsize=(20, 7))
# One panel per dataset (title, label column 9), on a grid of 1 row and 3 columns
label_sets = [
    ("All dataset", data[9]),
    ("Training set", data_train[9]),
    ("Testing set", data_test[9]),
]
for position, (title, labels) in enumerate(label_sets, start=1):
    plt.subplot(1, 3, position)
    # The labels are passed as x=: seaborn no longer treats a lone positional
    # argument as the variable to count
    sns.countplot(x=labels)
    plt.title(title)
    plt.xlabel("Label classes")

### 3.1.3 Descriptive statistics

In [None]:
# Summary statistics of df1 (the attribute-only frame, i.e. `data` without column 9)
df1.describe()

# 4. Design prediction methods using Logistic Regression (LR), KNN, SVM

## 4.1 Data preprocessing

### 4.1.1 Correlation matrix

In [None]:
# Pairwise correlation between all columns of `data` (the label column 9 is not dropped here)
corr = data.corr()
# Display the matrix as a table shaded with the 'coolwarm' colour map
corr.style.background_gradient(cmap='coolwarm')

### 4.1.2 Shuffle of data

In [None]:
#### Build 10 shuffled copies of the data and keep them in a dictionary ####
# NOTE(review): the name `dict` hides the Python builtin; it is kept because
# the normalization step reads the samples back through this name.
dict = {}
for run in range(10):
    # frac=1 draws every row in random order; each pass reshuffles the result
    # of the previous one
    shuffled = data.sample(frac=1)
    # Renumber the rows from 0 after shuffling
    data = shuffled.reset_index(drop=True)
    dict[f"sample{run}"] = data

### 4.1.3 Normalization of each dataset 

In [None]:
####### Split every shuffled sample into training/testing sets and standardize the inputs #######

# One entry per shuffled sample
lst_xtr = []  # Standardized training inputs
lst_xte = []  # Standardized testing inputs
lst_ytr = []  # Training labels
lst_yte = []  # Testing labels
for i in range(10):
    data = dict["sample" + str(i)]
    # Inputs: every column except the label column 9
    X = data.drop(9, axis=1)
    # Output: the label column
    Y = data[9]
    # Hold out 25% of the rows as the testing set
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25)
    # The scaler learns its mean and standard deviation from the training
    # inputs only, and those same statistics are applied to the testing
    # inputs. Refitting on the testing set would scale the two sets
    # differently and use test information during preprocessing.
    scaler = StandardScaler()
    X_train1 = scaler.fit_transform(X_train)
    X_test1 = scaler.transform(X_test)
    # Store the four pieces of this run
    lst_xtr.append(X_train1)
    lst_xte.append(X_test1)
    lst_ytr.append(Y_train)
    lst_yte.append(Y_test)

## 4.2 ML algorithms

### 4.2.1 Logistic Regression (LR)

In [None]:
# Per-run results of logistic regression
lst_acc_tr_lr, lst_acc_te_lr = [], []    # Training / testing accuracies
lst_time_tr_lr, lst_time_te_lr = [], []  # Training / testing times (seconds)
# Hyper-parameter optimization: grid search over the solver
parameters = {'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}
l = LogisticRegression(random_state=40, multi_class="auto")
lr = GridSearchCV(l, param_grid=parameters)
# Train and evaluate on each of the 10 prepared splits
for i in range(10):
    X_tr, Y_tr = lst_xtr[i], lst_ytr[i]
    X_te, Y_te = lst_xte[i], lst_yte[i]
    # NOTE(review): the training timer spans the grid-search fit and the
    # prediction on the training inputs.
    tic = time.time()
    lr.fit(X_tr, Y_tr)
    Y_train_predict = lr.predict(X_tr)
    lst_time_tr_lr.append(time.time() - tic)
    # Testing time: prediction on the testing inputs only
    tic = time.time()
    Y_test_predict = lr.predict(X_te)
    lst_time_te_lr.append(time.time() - tic)
    # Accuracies of this run
    lst_acc_tr_lr.append(accuracy_score(Y_tr, Y_train_predict))
    lst_acc_te_lr.append(accuracy_score(Y_te, Y_test_predict))

### 4.2.2 KNN

In [None]:
# Per-run results of KNN
lst_acc_tr_knn = []   # Training accuracies
lst_acc_te_knn = []   # Testing accuracies
lst_time_tr_knn = []  # Training times (seconds)
lst_time_te_knn = []  # Testing times (seconds)
# Grid search over the number of neighbours and the Minkowski power p.
# `n_jobs` is not searched: it only sets how many workers run the neighbour
# search and does not change predictions, so having it in the grid doubled
# the number of fitted candidates without changing the selected model.
parameters = {'n_neighbors': [5, 6, 7], 'p': [1, 2]}
KNN0 = KNeighborsClassifier(algorithm='auto', weights='distance')  # Define the KNN algorithm
KNN = GridSearchCV(KNN0, param_grid=parameters)
# Train and evaluate on each of the 10 prepared splits
for i in range(10):
    # NOTE(review): the training timer spans the grid-search fit and the
    # prediction on the training inputs.
    start1 = time.time()
    KNN.fit(lst_xtr[i], lst_ytr[i])
    Y_train_predict = KNN.predict(lst_xtr[i])
    stop1 = time.time()
    # Testing time: prediction on the testing inputs only
    start2 = time.time()
    Y_test_predict = KNN.predict(lst_xte[i])
    stop2 = time.time()
    lst_acc_tr_knn.append(accuracy_score(lst_ytr[i], Y_train_predict))
    lst_acc_te_knn.append(accuracy_score(lst_yte[i], Y_test_predict))
    lst_time_tr_knn.append(stop1 - start1)
    lst_time_te_knn.append(stop2 - start2)

### 4.2.3 SVM

In [None]:
# Per-run results of the SVM
lst_acc_tr_svm, lst_acc_te_svm = [], []    # Training / testing accuracies
lst_time_tr_svm, lst_time_te_svm = [], []  # Training / testing times (seconds)
# Grid search over the regularization strength C and the kernel coefficient gamma
parameters = {'C': [1, 2, 3, 4], 'gamma': ["auto", "scale"]}
svc0 = SVC(kernel='rbf')  # Define the SVM algorithm
svc = GridSearchCV(svc0, param_grid=parameters)
# Train and evaluate on each of the 10 prepared splits
for i in range(10):
    X_tr, Y_tr = lst_xtr[i], lst_ytr[i]
    X_te, Y_te = lst_xte[i], lst_yte[i]
    # NOTE(review): the training timer spans the grid-search fit and the
    # prediction on the training inputs.
    tic = time.time()
    svc.fit(X_tr, Y_tr)
    Y_train_predict = svc.predict(X_tr)
    lst_time_tr_svm.append(time.time() - tic)
    # Testing time: prediction on the testing inputs only
    tic = time.time()
    Y_test_predict = svc.predict(X_te)
    lst_time_te_svm.append(time.time() - tic)
    # Accuracies of this run
    lst_acc_tr_svm.append(accuracy_score(Y_tr, Y_train_predict))
    lst_acc_te_svm.append(accuracy_score(Y_te, Y_test_predict))

# 5. Comparison of models

## 5.1 In terms of accuracy

### 5.1.1 Mean accuracies

In [None]:
####### Mean testing accuracy of each model over the runs ########

# Average the per-run testing accuracies first, then report them
mean_acc_lr = np.mean(lst_acc_te_lr)
mean_acc_knn = np.mean(lst_acc_te_knn)
mean_acc_svm = np.mean(lst_acc_te_svm)
print("Mean acuracy LR: ", mean_acc_lr)
print("Mean acuracy KNN: ", mean_acc_knn)
print("Mean acuracy SVM: ", mean_acc_svm)

### 5.1.2 Box-plot of accuracies

In [None]:
########## Gather the testing accuracies of every model in one table ############

Names = ["LR", "KNN", "SVM"]
Scores = [lst_acc_te_lr, lst_acc_te_knn, lst_acc_te_svm]
# Built with one row per model ...
df1 = pd.DataFrame(Scores)
# ... then flipped to one column per model and one row per run
df2 = df1.transpose()
df2.columns = Names
# Display the table
df2

In [None]:
########### Box-plot of the testing accuracies ############

# Table with one column per model and one row per run
Names = ["LR", "KNN", "SVM"]
Scores = [lst_acc_te_lr, lst_acc_te_knn, lst_acc_te_svm]
df1 = pd.DataFrame(Scores)
df2 = df1.transpose()
df2.columns = Names
plt.figure(figsize=(8, 9))
# Font size of the tick labels on both axes
for tick_group in ('xtick', 'ytick'):
    plt.rc(tick_group, labelsize=15)
sns.set_style("darkgrid")
# White boxes with a red median line; showmeans adds a marker for the mean
sns.boxplot(
    data=df2,
    width=0.8,
    saturation=50,
    palette=["white"] * 3,
    medianprops={'color': 'red'},
    showmeans=True,
)
plt.ylabel('Accuracy', fontsize=18)
plt.xlabel('Model', fontsize=18)
plt.show()

## 5.2 Training time

### 5.2.1 Mean training time

In [None]:
# Average the per-run training times first, then report them in seconds
mean_train_time_lr = np.mean(lst_time_tr_lr)
mean_train_time_knn = np.mean(lst_time_tr_knn)
mean_train_time_svm = np.mean(lst_time_tr_svm)
print("Mean Training time LR: ", mean_train_time_lr, "s")
print("Mean Training time KNN: ", mean_train_time_knn, "s")
print("Mean Training time SVM: ", mean_train_time_svm, "s")

### 5.2.2 Box-plot of training time

In [None]:
# Table of training times: one column per model and one row per run
Names_r = ["LR", "KNN", "SVM"]
Scores = [lst_time_tr_lr, lst_time_tr_knn, lst_time_tr_svm]
df1 = pd.DataFrame(Scores)
df2 = df1.transpose()
df2.columns = Names_r
plt.figure(figsize=(8, 7))
# Font size of the tick labels on both axes
for tick_group in ('xtick', 'ytick'):
    plt.rc(tick_group, labelsize=15)
sns.set_style("darkgrid")
# White boxes with a red median line; showmeans adds a marker for the mean
sns.boxplot(
    data=df2,
    width=0.8,
    saturation=50,
    palette=["white"] * 3,
    medianprops={'color': 'red'},
    showmeans=True,
)
plt.ylabel('Training time (s)', fontsize=18)
plt.xlabel('Models', fontsize=18)

plt.show()

## 5.3 Testing time

### 5.3.1 Mean testing time

In [None]:
# Mean testing time of each model over the runs, in seconds. The values come
# from the lst_time_te_* lists (testing times), so the labels read
# "Testing time" rather than "Training time".
print("Mean Testing time LR: ", np.mean(lst_time_te_lr), "s")
print("Mean Testing time KNN: ", np.mean(lst_time_te_knn), "s")
print("Mean Testing time SVM: ", np.mean(lst_time_te_svm), "s")

### 5.3.2 Box-plot of testing time

In [None]:
# Table of testing times: one column per model and one row per run
Names_r = ["LR", "KNN", "SVM"]
Scores = [lst_time_te_lr, lst_time_te_knn, lst_time_te_svm]
df1 = pd.DataFrame(Scores)
df2 = df1.transpose()
df2.columns = Names_r
plt.figure(figsize=(8, 7))
# Font size of the tick labels on both axes
for tick_group in ('xtick', 'ytick'):
    plt.rc(tick_group, labelsize=15)
sns.set_style("darkgrid")
# White boxes with a red median line; showmeans adds a marker for the mean
sns.boxplot(
    data=df2,
    width=0.8,
    saturation=50,
    palette=["white"] * 3,
    medianprops={'color': 'red'},
    showmeans=True,
)
plt.ylabel('Testing time (s)', fontsize=18)
plt.xlabel('Models', fontsize=18)

plt.show()

# 6. Results for the resampled dataset

## 6.1 Import library

In [None]:
from imblearn.over_sampling import SMOTE


## 6.2 Define the method

In [None]:
# SMOTE oversampler created with its default settings (no arguments are passed,
# so no random seed is fixed here)
method = SMOTE()

## 6.3 Import dataset

In [None]:
# Load the training and testing sets (space-separated files without a header row)
data_train = pd.read_csv('/media/mydata/PHD Opportunity/WORK/data_train.csv', header=None, sep=" ")
data_test = pd.read_csv('/media/mydata/PHD Opportunity/WORK/data_test.csv', header=None, sep=" ")
# Merge both sets: transpose each one so that samples become columns, then
# place the two transposed frames side by side
df = pd.concat([data_train.T, data_test.T], axis=1)
# Number the samples uniformly from 0. The count comes from the merged frame
# itself rather than a hard-coded 58000, so files of any length can be loaded.
df.columns = list(range(df.shape[1]))
# Transpose back so that each row is one sample again
df11 = df.T
# Display the merged data
df11

## 6.4 Define the features set and the label set

In [None]:
# Label set: column 9 of the merged data
Y = df11[9]
# Feature set: every remaining column
X = df11.drop(columns=[9])

## 6.5 Apply the method to oversample the data

In [None]:
# Oversample the data with SMOTE. `fit_resample` is the imbalanced-learn
# method for this; the older `fit_sample` alias was deprecated and then removed.
# NOTE(review): the resampling runs on the full data, before the train/test
# split performed later in this section, so synthetic samples can be derived
# from rows that end up in the testing set - confirm this is intended.
X1, Y1 = method.fit_resample(X, Y)

## 6.6 Plot the label distribution after oversampling

In [None]:
# Bar chart of the class counts after oversampling. The labels are passed as
# x=: seaborn no longer treats a lone positional argument as the variable to count.
sns.countplot(x=Y1)

## 6.7 Divide the data into training and testing sets

In [None]:
# Hold out 25% of the resampled rows as the testing set
X_train, X_test, Y_train, Y_test = train_test_split(X1, Y1, test_size=0.25)
# The scaler learns its mean and standard deviation from the training inputs
# only, and those same statistics are applied to the testing inputs.
# Refitting on the testing set would scale the two sets differently and use
# test information during preprocessing.
scaler = StandardScaler()
X_train1 = scaler.fit_transform(X_train)
X_test1 = scaler.transform(X_test)

## 6.8 LR

In [None]:
from sklearn.metrics import confusion_matrix

# Logistic regression tuned by a grid search over the solver
parameters = {'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}
l = LogisticRegression(random_state=40, multi_class="auto")
lr = GridSearchCV(l, param_grid=parameters)

# NOTE(review): the training timer spans the grid-search fit and the
# prediction on the training inputs.
start1 = time.time()
lr.fit(X_train1, Y_train)
Y_train_predict = lr.predict(X_train1)
stop1 = time.time()

# Testing time: prediction on the testing inputs only
start2 = time.time()
Y_test_predict = lr.predict(X_test1)
stop2 = time.time()

# Compute all metrics first, then report them
train_accuracy = accuracy_score(Y_train, Y_train_predict)
test_accuracy = accuracy_score(Y_test, Y_test_predict)
conf_mat = confusion_matrix(y_true=Y_test, y_pred=Y_test_predict)

print('Training:')
print(train_accuracy)
print('Test:')
print(test_accuracy)
print(f"Training time: {stop1 - start1}s")
print(f"Test time: {stop2 - start2}s")
print('Confusion matrix:\n', conf_mat)

## 6.9 KNN 

In [None]:
# KNN tuned by a grid search over the number of neighbours and the Minkowski
# power p. `n_jobs` is not searched: it only sets how many workers run the
# neighbour search and does not change predictions, so having it in the grid
# doubled the number of fitted candidates without changing the selected model.
parameters = {'n_neighbors': [5, 6, 7], 'p': [1, 2]}
KNN0 = KNeighborsClassifier(algorithm='auto', weights='distance')
KNN = GridSearchCV(KNN0, param_grid=parameters)
# NOTE(review): the training timer spans the grid-search fit and the
# prediction on the training inputs.
start1 = time.time()
KNN.fit(X_train1, Y_train)
Y_train_predict = KNN.predict(X_train1)
stop1 = time.time()
# Testing time: prediction on the testing inputs only
start2 = time.time()
Y_test_predict = KNN.predict(X_test1)
stop2 = time.time()
print('Training:')
print(accuracy_score(Y_train, Y_train_predict))
print('Test:')
print(accuracy_score(Y_test, Y_test_predict))
print(f"Training time: {stop1 - start1}s")
print(f"Test time: {stop2 - start2}s")
from sklearn.metrics import confusion_matrix
# Confusion matrix of the testing predictions
conf_mat = confusion_matrix(y_true=Y_test, y_pred=Y_test_predict)
print('Confusion matrix:\n', conf_mat)

## 6.10 SVM

In [None]:
from sklearn.metrics import confusion_matrix

# RBF-kernel SVM tuned by a grid search over C and gamma
parameters = {'C': [1, 2, 3, 4], 'gamma': ["auto", "scale"]}
svc0 = SVC(kernel='rbf')
svc = GridSearchCV(svc0, param_grid=parameters)

# NOTE(review): the training timer spans the grid-search fit and the
# prediction on the training inputs.
start1 = time.time()
svc.fit(X_train1, Y_train)
Y_train_predict = svc.predict(X_train1)
stop1 = time.time()

# Testing time: prediction on the testing inputs only
start2 = time.time()
Y_test_predict = svc.predict(X_test1)
stop2 = time.time()

# Compute all metrics first, then report them
train_accuracy = accuracy_score(Y_train, Y_train_predict)
test_accuracy = accuracy_score(Y_test, Y_test_predict)
conf_mat = confusion_matrix(y_true=Y_test, y_pred=Y_test_predict)

print('Training:')
print(train_accuracy)
print('Test:')
print(test_accuracy)
print(f"Training time: {stop1 - start1}s")
print(f"Test time: {stop2 - start2}s")
print('Confusion matrix:\n', conf_mat)