In [159]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV, StratifiedKFold, KFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, log_loss
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.neighbors import KNeighborsClassifier

## HR dataset
- as an example of supervised learning

In [26]:
df = pd.read_csv('Cases/human-resources-analytics/HR_comma_sep.csv')

In [28]:
X = df.drop('left',axis =1)
y= df['left']
X_train, X_test, y_train, y_test = train_test_split(X,y , random_state =24, test_size=0.3, stratify = y)
ohe = OneHotEncoder(sparse_output= False, drop = 'first').set_output(transform = 'pandas')
ct = make_column_transformer(('passthrough', make_column_selector(dtype_exclude = object)), (ohe , make_column_selector(dtype_include= object)), verbose_feature_names_out = False)

In [32]:
lda = LinearDiscriminantAnalysis()
pipe = Pipeline([('CT', ct),('DA',lda)])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print("Acuuracy score : ", accuracy_score(y_test, y_pred))

Acuuracy score :  0.7695043342965103


In [34]:
qda = QuadraticDiscriminantAnalysis()
pipe = Pipeline([('CT', ct),('DA',qda)])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print("Acuuracy score : ", accuracy_score(y_test, y_pred))

Acuuracy score :  0.863080684596577


## Vehicle Silhouette 
- as an example of unsupervised learning (dimensionality reduction)

In [46]:
df1 = pd.read_csv("Cases/Vehicle Silhouettes/Vehicle.csv")

In [48]:
df1.head()

Unnamed: 0,Comp,Circ,D.Circ,Rad.Ra,Pr.Axis.Ra,Max.L.Ra,Scat.Ra,Elong,Pr.Axis.Rect,Max.L.Rect,Sc.Var.Maxis,Sc.Var.maxis,Ra.Gyr,Skew.Maxis,Skew.maxis,Kurt.maxis,Kurt.Maxis,Holl.Ra,Class
0,95,48,83,178,72,10,162,42,20,159,176,379,184,70,6,16,187,197,van
1,91,41,84,141,57,9,149,45,19,143,170,330,158,72,9,14,189,199,van
2,104,50,106,209,66,10,207,32,23,158,223,635,220,73,14,9,188,196,saab
3,93,41,82,159,63,9,144,46,19,143,160,309,127,63,6,10,199,207,van
4,85,44,70,205,103,52,149,45,19,144,241,325,188,127,9,11,180,183,bus


In [60]:
y = df1.Class
X = df1.drop("Class", axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X,y , random_state =24, test_size=0.3, stratify = y)
lda = LinearDiscriminantAnalysis().set_output(transform = 'pandas')
lda.fit(X_train, y_train)

In [83]:
X_train_lda = lda.transform(X_train)
print('New Shape: ',X_train_lda.shape)  # number of components (<= min(n_classes-1,n_features)) for dimensionality reduction. Here we have n_classes = 4.
print('Original Shape: ',X_train.shape)

New Shape:  (592, 3)
Original Shape:  (592, 18)


In [87]:
lr = LogisticRegression()
lr.fit(X_train_lda, y_train)

### By Using lda and Logistic Reegression

In [91]:
X_test_lda = lda.transform(X_test)
y_pred = lr.predict(X_test_lda)
print(accuracy_score(y_test,y_pred))

0.7992125984251969


In [110]:
pipe = Pipeline([('DA',lda)])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print('Accuracy Score: ', accuracy_score(y_test, y_pred))

Accuracy Score:  0.8031496062992126


In [112]:
# Using Pipe
pipe = Pipeline([('DA',lda),('LR',lr)])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print('Accuracy Score: ', accuracy_score(y_test, y_pred))

Accuracy Score:  0.7992125984251969


### By using only Logistic Regression

In [106]:
lr= LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print('Accuracy Score by only Logistic Regression: ',accuracy_score(y_test,y_pred))

Accuracy Score by only Logistic Regression:  0.7559055118110236


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Using Quadratic Determinant Ananlysis

In [120]:
pipe = Pipeline([('QDA',qda)]) # qda cant be applied on .transform . It can be fitted in pipeline in last as we are doing here, and not during regression.
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print('Accuracy Score: ', accuracy_score(y_test, y_pred))

Accuracy Score:  0.8543307086614174


## Wine Dataset

In [125]:
df = pd.read_csv('Datasets/wine.csv')

In [129]:
df.head()

Unnamed: 0,Class,Alcohol,Malic,Ash,Alcalinity,Magnesium,Phenols,Flavanoids,Nonflavanoid,Proanthocyanins,Intensity,Hue,OD280,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [131]:
y= df["Class"]
X= df.drop('Class', axis=1)

In [133]:
X_train, X_test, y_train, y_test = train_test_split(X,y , random_state =24, test_size=0.3, stratify = y)
lda.fit(X_train, y_train)

In [141]:
y_pred = lda.predict(X_test)
accuracy_score(y_test, y_pred)

1.0

In [163]:
qda.fit(X_train, y_train)
y_pred = qda.predict_proba(X_test)
log_loss(y_test, y_pred)

0.05406836184849984

## Yeast Dataset

In [166]:
df = pd.read_csv("C:/Users/DAI.STUDENTSDC/Downloads/archive (4)/yeast.csv")

In [168]:
df.head()

Unnamed: 0,mcg,gvh,alm,mit,erl,pox,vac,nuc,name
0,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,MIT
1,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,MIT
2,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,MIT
3,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,NUC
4,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,MIT


In [174]:
y= df.name
X= df.drop('name',axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y , random_state =24, test_size=0.3, stratify = y)
lda.fit(X_train, y_train)

In [180]:
y_pred= lda.predict(X_test)
accuracy_score(y_test, y_pred)

0.5672645739910314

In [182]:
qda.fit(X_train, y_train)
y_pred= qda.predict(X_test)
accuracy_score(y_test, y_pred)



0.20179372197309417

## Satellite Imaging

In [194]:
df= pd.read_csv('Cases/Satellite Imaging/Satellite.csv', sep = ";")

In [196]:
df.head()

Unnamed: 0,x.1,x.2,x.3,x.4,x.5,x.6,x.7,x.8,x.9,x.10,...,x.28,x.29,x.30,x.31,x.32,x.33,x.34,x.35,x.36,classes
0,92,115,120,94,84,102,106,79,84,102,...,104,88,121,128,100,84,107,113,87,grey soil
1,84,102,106,79,84,102,102,83,80,102,...,100,84,107,113,87,84,99,104,79,grey soil
2,84,102,102,83,80,102,102,79,84,94,...,87,84,99,104,79,84,99,104,79,grey soil
3,80,102,102,79,84,94,102,79,80,94,...,79,84,99,104,79,84,103,104,79,grey soil
4,84,94,102,79,80,94,98,76,80,102,...,79,84,103,104,79,79,107,109,87,grey soil


In [202]:
X= df.drop('classes',axis=1)
y= df.classes
X_train, X_test, y_train, y_test = train_test_split(X,y , random_state =24, test_size=0.3, stratify = y)

In [204]:
lda.fit(X_train, y_train)
y_pred= lda.predict(X_test)
accuracy_score(y_test, y_pred)

0.8296219575349559

In [206]:
qda.fit(X_train, y_train)
y_pred= qda.predict(X_test)
accuracy_score(y_test, y_pred)

0.8508544795442776

In [209]:
df.classes.unique()

array(['grey soil', 'damp grey soil', 'vegetation stubble',
       'very damp grey soil', 'cotton crop', 'red soil'], dtype=object)

In [217]:
np.unique(y_pred)

array(['cotton crop', 'damp grey soil', 'grey soil', 'red soil',
       'vegetation stubble', 'very damp grey soil'], dtype=object)

In [225]:
knn = KNeighborsClassifier(n_neighbors = 3)
pipe = Pipeline([('LDA', lda), ('KNN', knn)])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
accuracy_score(y_test, y_pred)

0.8663904712584153

In [227]:
y_pred_proba = pipe.predict_proba(X_test)
log_loss(y_test, y_pred_proba)

2.2447596256386877

In [245]:
knn = KNeighborsClassifier(n_neighbors = 3)
pipe = Pipeline([('LDA', lda), ('KNN', knn)])
pipe.fit(X_train, y_train)
kfold = StratifiedKFold(n_splits = 5, shuffle = True)
params = {'KNN__n_neighbors': np.arange(1,9)}
gcv = GridSearchCV(pipe, param_grid = params, cv = kfold, verbose = 2)
gcv.fit(X,y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END .................................KNN__n_neighbors=1; total time=   0.0s
[CV] END .................................KNN__n_neighbors=1; total time=   0.1s
[CV] END .................................KNN__n_neighbors=1; total time=   0.0s
[CV] END .................................KNN__n_neighbors=1; total time=   0.0s
[CV] END .................................KNN__n_neighbors=1; total time=   0.0s
[CV] END .................................KNN__n_neighbors=2; total time=   0.0s
[CV] END .................................KNN__n_neighbors=2; total time=   0.0s
[CV] END .................................KNN__n_neighbors=2; total time=   0.1s
[CV] END .................................KNN__n_neighbors=2; total time=   0.0s
[CV] END .................................KNN__n_neighbors=2; total time=   0.1s
[CV] END .................................KNN__n_neighbors=3; total time=   0.1s
[CV] END .................................KNN__n_

In [247]:
gcv.best_score_

0.8814296814296814

In [249]:
gcv.best_params_

{'KNN__n_neighbors': 6}

In [251]:
knn = KNeighborsClassifier(n_neighbors = 3)
pipe = Pipeline([('LDA', lda), ('KNN', knn)])
pipe.fit(X_train, y_train)
kfold = StratifiedKFold(n_splits = 5, shuffle = True)
params = {'KNN__n_neighbors': np.arange(1,9)}
gcv = GridSearchCV(pipe, param_grid = params, cv = kfold, verbose = 2, scoring = 'neg_log_loss')
gcv.fit(X,y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END .................................KNN__n_neighbors=1; total time=   0.0s
[CV] END .................................KNN__n_neighbors=1; total time=   0.0s
[CV] END .................................KNN__n_neighbors=1; total time=   0.0s
[CV] END .................................KNN__n_neighbors=1; total time=   0.0s
[CV] END .................................KNN__n_neighbors=1; total time=   0.0s
[CV] END .................................KNN__n_neighbors=2; total time=   0.0s
[CV] END .................................KNN__n_neighbors=2; total time=   0.0s
[CV] END .................................KNN__n_neighbors=2; total time=   0.0s
[CV] END .................................KNN__n_neighbors=2; total time=   0.0s
[CV] END .................................KNN__n_neighbors=2; total time=   0.0s
[CV] END .................................KNN__n_neighbors=3; total time=   0.0s
[CV] END .................................KNN__n_

In [253]:
print(gcv.best_score_, gcv.best_params_, sep = '\n')

-0.9315447993634309
{'KNN__n_neighbors': 8}


### LDA

In [17]:
milk

NameError: name 'milk' is not defined