# Select the classification model

In [101]:
import pandas as pd

In [102]:
# Read the CSV into a DataFrame and display its (rows, columns) shape.
df = pd.read_csv(".csv/03_datapoints.csv")
df.shape

(209999, 129)

In [103]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns


In [104]:
# Feature matrix: every column of df except "label".
X = df.drop(columns="label")
X.head()

Unnamed: 0,dp_0,dp_1,dp_2,dp_3,dp_4,dp_5,dp_6,dp_7,dp_8,dp_9,...,dp_118,dp_119,dp_120,dp_121,dp_122,dp_123,dp_124,dp_125,dp_126,dp_127
0,-0.851101,-0.133359,0.411688,-0.506884,-0.62475,-1.183555,-0.007701,-0.193852,-2.310815,-0.681502,...,-0.398452,-0.171966,-0.198043,-0.392601,-0.921793,-0.181319,0.049851,-0.743607,0.221477,0.06114
1,-0.942043,-0.028752,0.842339,-0.91713,-1.013945,-1.175337,-0.031011,-0.134986,-2.647296,-0.773516,...,-0.482067,-0.491768,-0.181433,-0.631909,-1.186748,-0.269359,-0.129906,-0.844575,0.131751,-0.067637
2,-0.965285,-0.201948,0.425197,-0.839285,-0.732815,-1.359382,0.186973,-0.21492,-2.731281,-0.73837,...,-0.454871,-0.064863,-0.139382,-0.451683,-1.054647,-0.422945,-0.04687,-0.828963,0.277579,-0.099526
3,-1.257392,-0.270322,0.366864,-0.904969,-0.922917,-1.498431,0.406486,-0.042352,-2.874939,-0.704583,...,-0.600211,-0.132417,-0.038785,-0.382238,-1.255367,-0.557412,-0.068375,-0.844525,0.311174,0.154526
4,-1.411341,-0.296022,0.336932,-0.924695,-0.962841,-1.601036,0.384875,-0.075125,-2.976923,-0.78572,...,-0.664096,-0.108609,-0.054045,-0.364247,-1.421688,-0.612453,-0.09307,-0.908785,0.356414,0.239415


In [105]:
# Target vector: the "label" column of df; display its shape.
y = df["label"]
y.shape

(209999,)

In [106]:
# Hold out a test set. No test_size is given, so scikit-learn's default
# proportion applies; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((157499, 128), (52500, 128), (157499,), (52500,))

In [111]:
def metrics(model, X=None, y=None, average="macro"):
    """Fit ``model`` on the training split and score its predictions.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place on the notebook-level ``X_train`` / ``y_train``.
    X : optional
        Features to predict on. ``None`` (the default) means the
        notebook-level ``X_test``, looked up when the function is called.
    y : optional
        True labels matching ``X``. ``None`` (the default) means the
        notebook-level ``y_test``, looked up when the function is called.
    average : str, default "macro"
        Averaging mode forwarded to precision, recall and F1.

    Returns
    -------
    dict
        Scores under the keys ``'accuracy'``, ``'precision'``, ``'recall'``
        and ``'f_score'``, in that order.
    """
    # Resolve the evaluation data at call time. Binding X_test / y_test as
    # default values would freeze whatever objects existed when this cell was
    # run, so a later re-split would train on the new X_train but score on the
    # old test rows.
    if X is None:
        X = X_test
    if y is None:
        y = y_test

    model.fit(X_train, y_train)
    y_pred = model.predict(X)

    # NOTE(review): zero_division=1 makes a class the model never predicts
    # count as precision 1.0, which lifts the averaged precision of weak
    # models; recall and F1 keep scikit-learn's default handling. Consider
    # zero_division=0 when comparing candidates.
    return {
        "accuracy": accuracy_score(y, y_pred),
        "precision": precision_score(y, y_pred, average=average, zero_division=1),
        "recall": recall_score(y, y_pred, average=average),
        "f_score": f1_score(y, y_pred, average=average),
    }

In [112]:
# Candidate: linear classifier trained with stochastic gradient descent (seeded).
metrics(SGDClassifier(random_state=42))

{'accuracy': 0.04180952380952381,
 'precision': 0.4692605728244183,
 'recall': 0.042262610515370676,
 'f_score': 0.02138734537404831}

In [113]:
# Candidate: random forest with default settings apart from the fixed seed.
metrics(RandomForestClassifier(random_state=42))

{'accuracy': 0.09215238095238096,
 'precision': 0.08408570016058782,
 'recall': 0.09178982868331663,
 'f_score': 0.08595148393830342}

In [115]:
# Candidate: k-nearest-neighbours classifier with default settings.
metrics(KNeighborsClassifier())

{'accuracy': 0.06405714285714285,
 'precision': 0.06282985688816108,
 'recall': 0.06367038382678798,
 'f_score': 0.056055398541393964}