In [63]:
import pandas as pd
import numpy as np
import copy as cp
import matplotlib.pyplot as plt

import seaborn as sns
from typing import Tuple
from sklearn.metrics import classification_report, confusion_matrix 
from sklearn.model_selection import StratifiedKFold
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn import decomposition
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [79]:
def cross_val_predict(model, k, X, y) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Run stratified k-fold cross-validation with per-fold scaling and PCA.

    For every fold a fresh ``StandardScaler`` and ``PCA`` are fitted on the
    training split only and then applied to both splits, so no information
    from the held-out split reaches the preprocessing. A deep copy of
    ``model`` is fitted on each training split; the caller's estimator is
    never fitted.

    Per-fold and average accuracy/precision are printed.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict`` (optionally
        ``predict_proba``).
    k : int
        Number of stratified folds.
    X : array-like, indexed by sample along the first axis.
    y : array-like of class labels, one per row of ``X``.

    Returns
    -------
    actual_classes : np.ndarray
        True labels of the held-out samples, concatenated in fold order.
    predicted_classes : np.ndarray
        Predicted labels, aligned with ``actual_classes``.
    predicted_proba : np.ndarray
        One row per held-out sample and one column per distinct label in
        ``y``. Rows are all zeros for folds where the model does not provide
        ``predict_proba``.
    """
    # Convert to ndarrays so the fold indices are applied positionally; on a
    # pandas Series ``y[idx]`` is a label lookup and only happens to work
    # with a default RangeIndex.
    X = np.asarray(X)
    y = np.asarray(y)

    kfold = StratifiedKFold(n_splits=k)
    model_ = cp.deepcopy(model)
    acc_score = []
    pre_score = []

    no_classes = len(np.unique(y))
    # 'binary' is precision_score's default; it raises for more than two
    # classes, so fall back to macro averaging in that case.
    average = 'binary' if no_classes <= 2 else 'macro'

    actual_classes = np.empty([0], dtype=int)
    predicted_classes = np.empty([0], dtype=int)
    predicted_proba = np.empty([0, no_classes])

    for train_ndx, test_ndx in kfold.split(X, y):
        train_X, test_X = X[train_ndx], X[test_ndx]
        train_y, test_y = y[train_ndx], y[test_ndx]

        actual_classes = np.append(actual_classes, test_y)

        # Standardize: fit on the train split only, apply to both splits.
        # (StandardScaler is imported directly; there is no `preprocessing`
        # name in scope.)
        scaler = StandardScaler()
        scaler.fit(train_X)
        train_X = scaler.transform(train_X)
        test_X = scaler.transform(test_X)

        # PCA: fit on the train split only, apply to both splits.
        pca = PCA()
        pca.fit(train_X)
        train_X = pca.transform(train_X)
        test_X = pca.transform(test_X)

        model_.fit(train_X, train_y)
        pred_values = model_.predict(test_X)

        # sklearn metrics take (y_true, y_pred). Accuracy is symmetric, but
        # swapping the arguments of precision_score yields recall instead.
        acc_score.append(accuracy_score(test_y, pred_values))
        pre_score.append(precision_score(test_y, pred_values, average=average))
        predicted_classes = np.append(predicted_classes, pred_values)

        try:
            fold_proba = model_.predict_proba(test_X)
        except (AttributeError, NotImplementedError):
            # Model has no probability output (e.g. SVC without
            # probability=True): keep row alignment with zero placeholders.
            fold_proba = np.zeros((len(test_X), no_classes), dtype=float)
        predicted_proba = np.append(predicted_proba, fold_proba, axis=0)

    # Averages are computed once, after all folds have been scored.
    avg_acc_score = sum(acc_score) / len(acc_score)
    avg_pre_score = sum(pre_score) / len(pre_score)

    print('accuracy of each fold - {}'.format(acc_score))
    print('Avg accuracy : {}'.format(avg_acc_score))
    print('precission of each fold - {}'.format(pre_score))
    print('Avg precission : {}'.format(avg_pre_score))

    return actual_classes, predicted_classes, predicted_proba

In [46]:
def plot_confusion_matrix(actual_classes : np.array, predicted_classes : np.array, sorted_labels : list):
    """Draw an annotated heatmap of the confusion matrix and show it.

    Rows are the actual classes and columns the predicted classes, both
    ordered as given in ``sorted_labels``.
    """
    counts = confusion_matrix(actual_classes, predicted_classes, labels=sorted_labels)

    plt.figure(figsize=(12.8, 6))
    # heatmap draws on the current axes and returns them; label via that handle.
    axes = sns.heatmap(
        counts,
        annot=True,
        xticklabels=sorted_labels,
        yticklabels=sorted_labels,
        cmap="Blues",
        fmt="g",
    )
    axes.set_xlabel('Predicted')
    axes.set_ylabel('Actual')
    axes.set_title('Confusion Matrix')
    plt.show()
  

In [48]:
# Load the feature table from a CSV in the working directory.
dataframe = pd.read_csv('X_phrase.csv')
# Features: every column from position 3 up to, but not including, the last one.
X = dataframe.iloc[:, 3:-1].values
# Target: the column at position 6374 (kept as a pandas Series).
# NOTE(review): presumably 6374 is the last column, i.e. the one excluded
# from X above — confirm against the CSV; `iloc[:, -1]` would make that explicit.
y = dataframe.iloc [:, 6374]

# Number of cross-validation folds passed to cross_val_predict below.
k=10

In [None]:
# RBF-kernel SVM with C=100, evaluated with the k-fold helper defined above.
model =  svm.SVC(kernel='rbf', C=100)
# The third return value (predicted probabilities) is discarded here.
actual_classes, predicted_classes, _ = cross_val_predict(model, k, X, y)
# [0, 1] fixes the row/column order of the confusion matrix.
# NOTE(review): assumes the target is binary with labels 0 and 1 — confirm.
plot_confusion_matrix(actual_classes, predicted_classes, [0, 1])

In [66]:

# Evaluate a StandardScaler -> PCA(120) -> RBF-SVM pipeline with repeated
# stratified k-fold cross-validation.
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

# Define the pipeline. Each step must be its own (name, estimator) tuple; the
# 'pca' tuple was previously left unclosed, which is a SyntaxError.
steps = [('norm', StandardScaler()), ('pca', PCA(n_components=120)), ('m', svm.SVC(kernel='rbf', C=100))]
model = Pipeline(steps=steps)
# Evaluate the model: 10 folds x 10 repeats. The splitter is seeded (same seed
# as evaluate_model) so the reported score is reproducible across runs.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
# Report performance as mean (standard deviation) over all folds and repeats.
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Accuracy: 0.828 (0.027)


In [65]:
def get_models(n_components_values=range(1, 21)):
    """Build one scale -> PCA -> RBF-SVM pipeline per PCA component count.

    Parameters
    ----------
    n_components_values : iterable of int, optional
        PCA ``n_components`` settings to build a pipeline for. Defaults to
        1 through 20 inclusive, the sweep this notebook originally used.

    Returns
    -------
    dict
        Maps ``str(n_components)`` to an unfitted ``Pipeline`` with steps
        ``'norm'`` (StandardScaler), ``'pca'`` (PCA) and ``'m'``
        (SVC, RBF kernel, C=100).
    """
    models = dict()
    for n_components in n_components_values:
        steps = [
            ('norm', StandardScaler()),
            ('pca', PCA(n_components=n_components)),
            ('m', svm.SVC(kernel='rbf', C=100)),
        ]
        models[str(n_components)] = Pipeline(steps=steps)
    return models

def evaluate_model(model, X, y):
    """Score ``model`` on (X, y) with repeated stratified k-fold CV.

    Uses 10 folds repeated 3 times with a fixed seed and returns the array
    of per-fold accuracy scores. Fit/score errors are raised, not masked.
    """
    splitter = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1)
    return cross_val_score(
        model,
        X,
        y,
        cv=splitter,
        scoring='accuracy',
        error_score='raise',
        n_jobs=-1,
    )

# Reload the data so this cell does not depend on an earlier cell's X and y.
dataframe = pd.read_csv('X_phrase.csv')
# Features: every column from position 3 up to, but not including, the last one.
X = dataframe.iloc[:, 3:-1].values
# Target: the column at position 6374.
# NOTE(review): presumably the last column of the CSV — confirm.
y = dataframe.iloc [:, 6374]

# One pipeline per PCA component count, keyed by that count as a string.
models = get_models()
# Evaluate each pipeline; keep the per-fold scores and the matching name in
# parallel lists.
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    # Progress line: component count, mean accuracy, (standard deviation).
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>1 0.679 (0.003)
>2 0.718 (0.029)
>3 0.736 (0.032)
>4 0.736 (0.027)
>5 0.733 (0.030)
>6 0.730 (0.030)
>7 0.727 (0.028)
>8 0.732 (0.021)
>9 0.732 (0.025)
>10 0.727 (0.030)
>11 0.720 (0.028)
>12 0.732 (0.026)
>13 0.723 (0.024)
>14 0.723 (0.024)
>15 0.725 (0.021)
>16 0.737 (0.024)
>17 0.738 (0.027)
>18 0.740 (0.030)
>19 0.747 (0.027)
>20 0.746 (0.023)


NameError: name 'pyplot' is not defined

In [76]:
# NOTE(review): scratch expression; it is not referenced by any other cell
# shown here. Presumably a leftover sanity check — consider removing.
min(1978, 1)

1