In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,log_loss
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV,StratifiedKFold,KFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
import warnings as ws
ws.filterwarnings("ignore")

In [2]:
train=pd.read_csv("train.csv")
train.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
X=train.drop("label",axis=1)
y=train['label']

### LOGISTIC REGRESSION

In [4]:
lr=LogisticRegression()

In [5]:
kfold=StratifiedKFold(n_splits=5,shuffle=True,random_state=23)

In [6]:
params={'penalty':['l1','l2',None],
       'multi_class':['multinomial']}

In [7]:
gcv=GridSearchCV(lr,param_grid=params,cv=kfold,scoring='neg_log_loss')

In [8]:
gcv.fit(X,y)

In [9]:
print("best parameter :",gcv.best_params_)
print("best score :",gcv.best_score_)

best parameter : {'multi_class': 'multinomial', 'penalty': None}
best score : -0.31932675299122754


In [17]:
gcv=GridSearchCV(lr,param_grid=params,cv=kfold,scoring='accuracy')

In [18]:
gcv.fit(X,y)

In [19]:
print("best parameter :",gcv.best_params_)
print("best score :",gcv.best_score_)

best parameter : {'multi_class': 'multinomial', 'penalty': None}
best score : 0.915595238095238


In [10]:
bm_logistic=gcv.best_estimator_

In [11]:
test=pd.read_csv('test.csv')
test.head()

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
test.shape

(28000, 784)

In [13]:
y_pred=bm_logistic.predict(test)

In [14]:
submit_lg = pd.DataFrame({'ImageId':np.arange(1,28001),'label':y_pred})

In [15]:
submit_lg.to_csv("Submit.csv",index=False)

### KNN

In [38]:
knn=KNeighborsClassifier()

In [39]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.pipeline import Pipeline

In [40]:
scaler=StandardScaler()
knn=KNeighborsClassifier()
pipe=Pipeline([('SCL',scaler),('KNN',knn)])

In [41]:
params={'KNN__n_neighbors':np.arange(1,6),'SCL':[StandardScaler(),MinMaxScaler()]}

In [42]:
gcv=GridSearchCV(pipe,param_grid=params,cv=kfold,scoring='neg_log_loss',n_jobs=2)
gcv.fit(X,y)

In [43]:
print("best parameter :",gcv.best_params_)
print("best score :",gcv.best_score_)

best parameter : {'KNN__n_neighbors': 5, 'SCL': MinMaxScaler()}
best score : -0.38688440588268735


In [44]:
bm_knn=gcv.best_estimator_

In [54]:
y_pred=bm_knn.predict(test)

In [55]:
submit_knn = pd.DataFrame({'ImageId':np.arange(1,28001),'label':y_pred})

In [57]:
submit_knn.to_csv("Submit(knn).csv",index=False)

### Gaussian NB

In [50]:
nb=GaussianNB()

In [51]:
params={'var_smoothing':np.linspace(0,1,20)}

In [52]:
gcv=GridSearchCV(nb,param_grid=params,cv=kfold,scoring='neg_log_loss',n_jobs=2,verbose=3)
gcv.fit(X,y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


  n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * self.var_[i, :]))
  n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * self.var_[i, :]))
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) / (self.var_[i, :]), 1)
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) / (self.var_[i, :]), 1)
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) / (self.var_[i, :]), 1)
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) / (self.var_[i, :]), 1)
Traceback (most recent call last):
  File "/home/dai/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 810, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/dai/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/dai/anaconda3/lib/python3

  n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * self.var_[i, :]))
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) / (self.var_[i, :]), 1)
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) / (self.var_[i, :]), 1)
Traceback (most recent call last):
  File "/home/dai/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 810, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/dai/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/dai/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 401, in _score
    return self._sign * self._score_func(y, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/ho





In [58]:
print("best parameter :",gcv.best_params_)
print("best score :",gcv.best_score_)

best parameter : {'var_smoothing': 1.0}
best score : -4.078528514704209


In [59]:
bm_gnb=gcv.best_estimator_

In [60]:
y_pred=bm_gnb.predict(test)

In [61]:
submit_gnb = pd.DataFrame({'ImageId':np.arange(1,28001),'label':y_pred})

In [62]:
submit_gnb.to_csv("Submit(gnb).csv",index=False)

![Screenshot%20from%202023-11-04%2015-29-48.png](attachment:Screenshot%20from%202023-11-04%2015-29-48.png)

since the accuracy for knn is greater, we conclude that the knn model is better