In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, GridSearchCV
from pandas_profiling import ProfileReport
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/aniruddhachoudhury/Red-Wine-Quality/master/winequality-red.csv")
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [3]:
x = df.drop(columns = "quality")
x

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2


In [4]:
y = df.quality
y

0       5
1       5
2       5
3       6
4       5
       ..
1594    5
1595    6
1596    6
1597    5
1598    6
Name: quality, Length: 1599, dtype: int64

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x,y, random_state=20)

# KNN

In [6]:
knn = KNeighborsClassifier()

In [7]:
knn.fit(x_train, y_train)

KNeighborsClassifier()

In [8]:
knn.score(x_test, y_test)

0.5175

In [9]:
params = {
    
    "n_neighbors":[3,5,7,9,12,13,15,17,21]
}

In [10]:
grid_cv = GridSearchCV(knn, param_grid=params)

In [11]:
grid_cv.fit(x_train, y_train)

GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [3, 5, 7, 9, 12, 13, 15, 17, 21]})

In [12]:
grid_cv.best_params_

{'n_neighbors': 17}

In [13]:
knn_new = KNeighborsClassifier(n_neighbors=17)

In [14]:
knn_new.fit(x_train, y_train)

KNeighborsClassifier(n_neighbors=17)

In [15]:
knn_new.score(x_test, y_test)

0.555

In [16]:
knn_new = KNeighborsClassifier(n_neighbors=17, p = 1)

#### p = 1 simply means we are opting out for manhattan distance....n_jobs is for CPU core utilisation,,,-1 is default which means it would utilise all of my cores

In [17]:
knn_new.fit(x_train, y_train)

KNeighborsClassifier(n_neighbors=17, p=1)

In [18]:
knn_new.score(x_test, y_test)

0.56

In [19]:
params = {
    
    "n_neighbors":[3,5,7,9,12,13,15,17,21],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "p": [1,2],
    "leaf_size": range(10,51,5),
    "weights": ["uniform", "distance"]
}

In [20]:
grid_cv_new = GridSearchCV(knn, param_grid= params)

In [21]:
grid_cv_new.fit(x_train, y_train)

GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                         'leaf_size': range(10, 51, 5),
                         'n_neighbors': [3, 5, 7, 9, 12, 13, 15, 17, 21],
                         'p': [1, 2], 'weights': ['uniform', 'distance']})

In [22]:
grid_cv_new.best_params_

{'algorithm': 'auto',
 'leaf_size': 10,
 'n_neighbors': 21,
 'p': 1,
 'weights': 'distance'}

In [23]:
knn_newest = KNeighborsClassifier(n_neighbors =21, p =1, algorithm = "auto", leaf_size = 10, weights = "distance")

In [24]:
knn_newest.fit(x_train, y_train)

KNeighborsClassifier(leaf_size=10, n_neighbors=21, p=1, weights='distance')

In [25]:
knn_newest.score(x_test, y_test)

0.6325

In [26]:
# note: not only grid_params, but value of cv can also help us improvise our model

In [27]:
# let us try to save our model

import pickle
pickle.dump(knn_newest, open("knn.pkl", "wb"))

In [28]:
# NOTE: parameters change lead to change in size of the model
# Since KNN is a lazy learner, it holds/ stores datasets...therefore it has high size
## so it is not advisable to use KNN algo in case of huge dataset

In [29]:
knn_newest.predict(x_test)

array([6, 5, 5, 5, 5, 6, 5, 6, 5, 6, 6, 5, 6, 5, 5, 5, 5, 6, 6, 6, 5, 6,
       6, 5, 7, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 7, 6, 5, 6, 6, 5, 5, 6, 5,
       7, 5, 6, 6, 6, 6, 7, 5, 5, 5, 6, 6, 5, 5, 5, 6, 5, 5, 6, 5, 5, 5,
       5, 5, 6, 6, 5, 6, 6, 6, 5, 5, 6, 5, 5, 6, 5, 6, 6, 5, 5, 6, 5, 5,
       5, 5, 5, 6, 5, 6, 6, 6, 5, 6, 7, 5, 5, 5, 6, 6, 6, 6, 6, 5, 5, 6,
       5, 6, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, 5, 5, 6, 6, 6, 5, 6, 7, 6, 5,
       5, 6, 6, 5, 6, 6, 5, 7, 5, 6, 6, 5, 5, 6, 6, 6, 5, 6, 5, 5, 5, 6,
       6, 6, 5, 5, 5, 6, 6, 6, 6, 6, 5, 7, 5, 5, 5, 5, 5, 6, 5, 6, 6, 6,
       6, 6, 5, 5, 7, 6, 7, 5, 5, 6, 5, 5, 6, 5, 6, 5, 6, 6, 5, 7, 5, 6,
       6, 6, 6, 5, 6, 5, 6, 5, 6, 6, 5, 5, 6, 6, 5, 5, 6, 5, 6, 5, 5, 5,
       5, 5, 5, 6, 6, 7, 5, 6, 7, 6, 5, 5, 7, 6, 5, 5, 6, 6, 6, 5, 6, 7,
       7, 5, 6, 6, 5, 6, 5, 6, 5, 5, 6, 5, 5, 6, 7, 5, 5, 6, 6, 5, 5, 5,
       6, 6, 6, 7, 5, 5, 5, 7, 7, 6, 5, 7, 6, 5, 6, 5, 6, 6, 6, 5, 5, 5,
       5, 5, 5, 6, 6, 6, 6, 7, 7, 5, 5, 5, 6, 5, 5,

In [30]:
knn_newest.score(x_test,y_test)

0.6325

In [31]:
knn_newest.predict([x_test.iloc[0]])

array([6], dtype=int64)

# SVM 

In [32]:
from sklearn.svm import SVC # SVC means support vector classifier

In [33]:
svc = SVC()
svc.fit(x_train, y_train)
svc.score(x_test, y_test)

0.5

In [34]:
params = {
    
    "kernel" : ["rbf", "linear", "sigmoid", "poly"]
    
}

svm_grid = GridSearchCV(svc, param_grid= params)
svm_grid.fit(x_train, y_train)

GridSearchCV(estimator=SVC(),
             param_grid={'kernel': ['rbf', 'linear', 'sigmoid', 'poly']})

In [35]:
svm_grid.best_params_

{'kernel': 'linear'}

In [36]:
svc1 = SVC(kernel = "linear")
svc1.fitx_trainrain, y_train)
svc1.score(x_test, y_test)

0.5925

In [None]:
# c is a regularization parameter like l1, l2 etc...it must be positive
# gama is a parameter dedicated only for rbf, poly and sigmoid kernels, 
# it is just a constant to :
        # select a suitable separation distance
        # control 3rd dimension
# degree is a parameter dedicated only for poly kernel

params = {
    
    "kernel" : ["rbf", "linear", "sigmoid", "poly"],
    "C": [0.1, 0.4, 0.6, 1,2,3,100,200,500],
    "gamma": [0.001,0.1, 0.004, 0.003, 0.04]
    
}

svm_grid = GridSearchCV(svc, param_grid= params, verbose=3)
svm_grid.fit(x_train, y_train)

Fitting 5 folds for each of 180 candidates, totalling 900 fits
[CV 1/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.529 total time=   0.1s
[CV 2/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.504 total time=   0.1s
[CV 3/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.475 total time=   0.1s
[CV 4/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.463 total time=   0.1s
[CV 5/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.460 total time=   0.1s
[CV 1/5] END .C=0.1, gamma=0.001, kernel=linear;, score=0.600 total time=   0.5s
[CV 2/5] END .C=0.1, gamma=0.001, kernel=linear;, score=0.525 total time=   0.7s
[CV 3/5] END .C=0.1, gamma=0.001, kernel=linear;, score=0.575 total time=   0.6s
[CV 4/5] END .C=0.1, gamma=0.001, kernel=linear;, score=0.600 total time=   0.7s
[CV 5/5] END .C=0.1, gamma=0.001, kernel=linear;, score=0.548 total time=   1.0s
[CV 1/5] END C=0.1, gamma=0.001, kernel=sigmoid;, score=0.338 total time=   0.0s
[CV 2/5] END C=0.1, gamma=0.001, kernel=sigmoi

In [1]:
#NOTE (OUM Out Of Memory Issue): What to do if GPU time runout while training your CNN?
    
    # Decrease batch size,
    # increase number of iterations by decreasing a batch
    # Try to use mini batch instead of batch or stockistic
    # reducing number of data that we are trying to give
    # changing the dimension of data (like repalce rgb in 3D data with grey)
    
# Early stopping means to stop at a place beyond which error is constant and 
# not able to learn something, but still sending data again and again, 
# i.e. not to try anymore with different possible permutations and combinations.

# SVR  (Support Vector Regressor)

In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, GridSearchCV
from pandas_profiling import ProfileReport
from sklearn.svm import SVR

In [5]:
df = pd.read_csv("https://raw.githubusercontent.com/srinivasav22/Graduate-Admission-Prediction/master/Admission_Predict_Ver1.1.csv")
df

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.00,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.80
4,5,314,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...,...
495,496,332,108,5,4.5,4.0,9.02,1,0.87
496,497,337,117,5,5.0,5.0,9.87,1,0.96
497,498,330,120,5,4.5,5.0,9.56,1,0.93
498,499,312,103,4,4.0,5.0,8.43,0,0.73


In [18]:
df

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.00,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.80
4,5,314,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...,...
495,496,332,108,5,4.5,4.0,9.02,1,0.87
496,497,337,117,5,5.0,5.0,9.87,1,0.96
497,498,330,120,5,4.5,5.0,9.56,1,0.93
498,499,312,103,4,4.0,5.0,8.43,0,0.73


In [26]:
x = df.drop(columns=["Serial No.", "Chance of Admit "], axis=1)
x

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
0,337,118,4,4.5,4.5,9.65,1
1,324,107,4,4.0,4.5,8.87,1
2,316,104,3,3.0,3.5,8.00,1
3,322,110,3,3.5,2.5,8.67,1
4,314,103,2,2.0,3.0,8.21,0
...,...,...,...,...,...,...,...
495,332,108,5,4.5,4.0,9.02,1
496,337,117,5,5.0,5.0,9.87,1
497,330,120,5,4.5,5.0,9.56,1
498,312,103,4,4.0,5.0,8.43,0


In [27]:
y = df["Chance of Admit "]

In [28]:
x_train, x_test, y_train, y_test = train_test_split(x,y, random_state=30, test_size= 0.20)  

In [29]:
svr = SVR()

In [30]:
svr.fit(x_train, y_train)

SVR()

In [31]:
svr.predict(x_test)

array([0.79359894, 0.56840662, 0.70121728, 0.64051776, 0.82877673,
       0.75720511, 0.65919306, 0.61314479, 0.73137208, 0.7450226 ,
       0.83095028, 0.68214796, 0.77285966, 0.88845842, 0.80998244,
       0.8247482 , 0.55105976, 0.5986574 , 0.75757418, 0.7159455 ,
       0.70815123, 0.70214425, 0.7920094 , 0.80912574, 0.7717613 ,
       0.56683869, 0.63246794, 0.64924208, 0.62574847, 0.58584348,
       0.6697305 , 0.53829813, 0.78952432, 0.63965643, 0.66426546,
       0.66289981, 0.65128345, 0.79695115, 0.5501086 , 0.66026974,
       0.57509407, 0.61435263, 0.67883631, 0.66912539, 0.78486554,
       0.65539903, 0.91631603, 0.63191225, 0.62692592, 0.77809513,
       0.88682892, 0.69506213, 0.79551633, 0.69391549, 0.67678932,
       0.88428817, 0.63590511, 0.53807879, 0.62764507, 0.71415275,
       0.78007972, 0.68890559, 0.73340895, 0.64024814, 0.70710604,
       0.66720127, 0.66543166, 0.62400627, 0.86396078, 0.6824248 ,
       0.52723608, 0.871495  , 0.64914782, 0.53984195, 0.64897

In [32]:
svr.score(x_test, y_test)

0.6851176591184742

#### but this mtd is not correct to find score in case of regression

In [33]:
df.isna().sum()

Serial No.           0
GRE Score            0
TOEFL Score          0
University Rating    0
SOP                  0
LOR                  0
CGPA                 0
Research             0
Chance of Admit      0
dtype: int64

In [34]:
# this means our data is preprocessed
#vanila flavor means no hyperparameter tuning

In [35]:
#let us try to use r square to check the accuracy of the SVR model

In [36]:
from sklearn.metrics import r2_score

In [37]:
r2_score(y_test, svr.predict(x_test))

0.6851176591184742

In [38]:
# however we can see that the accuracy score without using r2_score is same as using r2_score

In [39]:
svr.predict([x.iloc[0]])

array([0.88845842])

In [40]:
y[0]

0.92

In [41]:
# we can see our model is giving almost similar result like expected

# Stacking

In [42]:
# in stacking, we have freedom to use different types of algos as base estimator or base algo

In [44]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

In [45]:
df = pd.read_csv("https://raw.githubusercontent.com/aniruddhachoudhury/Red-Wine-Quality/master/winequality-red.csv")
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [46]:
x=df.drop(columns ="quality")
y=df.quality

In [47]:
x_train, x_test, y_train, y_test = train_test_split(x,y, random_state=30, test_size= 0.15)

In [48]:
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)

KNeighborsClassifier()

In [49]:
knn.score(x_test, y_test)

0.5083333333333333

In [50]:
svc = SVC()
svc.fit(x_train, y_train)

SVC()

In [51]:
svc.score(x_test, y_test)

0.5125

In [53]:
# knn.predict(x_test) # here data passed into for prediction will be diferent ...
#in satcking, we do to atleast 2 levels of data division generally...
# we are supposed to divide original data into 1st division
# and then subset of divided data is taken for train - test split

In [54]:
train, val_train, test, val_test = train_test_split(x,y, test_size=0.50, random_state=30)

In [55]:
x_train, x_test, y_train, y_test = train_test_split(train, test, random_state=30, test_size=0.15)

In [56]:
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)

KNeighborsClassifier()

In [57]:
knn.score(x_test, y_test)

0.5833333333333334

In [58]:
prediction_knn = knn.predict(val_train)
prediction_svc = svc.predict(val_train)

In [59]:
np.column_stack((prediction_knn, prediction_svc))

array([[7, 6],
       [5, 6],
       [6, 6],
       ...,
       [6, 5],
       [6, 6],
       [5, 5]], dtype=int64)

In [60]:
input3 = np.column_stack((prediction_knn, prediction_svc))

In [61]:
output = val_test

In [62]:
pd.DataFrame(input3)

Unnamed: 0,0,1
0,7,6
1,5,6
2,6,6
3,5,5
4,5,6
...,...,...
795,5,6
796,5,6
797,6,5
798,6,6


In [63]:
pd.DataFrame(output)

Unnamed: 0,quality
1147,7
659,4
871,5
1333,5
1411,6
...,...
1073,6
200,7
942,7
1106,6


In [64]:
rf = RandomForestClassifier()
rf.fit(input3, output)

RandomForestClassifier()

In [66]:
knn_output = knn.predict(x_test)
svc_output = svc.predict(x_test)
np.column_stack((knn_output, svc_output))

array([[5, 5],
       [6, 6],
       [5, 5],
       [5, 5],
       [6, 6],
       [6, 6],
       [5, 6],
       [6, 6],
       [6, 6],
       [5, 5],
       [6, 6],
       [5, 6],
       [5, 5],
       [5, 6],
       [6, 6],
       [6, 6],
       [6, 6],
       [5, 6],
       [5, 6],
       [6, 6],
       [5, 6],
       [7, 6],
       [6, 6],
       [5, 5],
       [6, 5],
       [5, 6],
       [7, 6],
       [6, 6],
       [5, 5],
       [5, 6],
       [6, 6],
       [5, 5],
       [6, 6],
       [5, 6],
       [6, 6],
       [6, 6],
       [5, 5],
       [5, 5],
       [6, 6],
       [5, 5],
       [5, 6],
       [6, 6],
       [5, 6],
       [5, 6],
       [6, 6],
       [7, 6],
       [5, 5],
       [5, 5],
       [6, 6],
       [5, 6],
       [6, 6],
       [5, 6],
       [6, 6],
       [5, 5],
       [6, 6],
       [6, 6],
       [5, 6],
       [6, 6],
       [6, 6],
       [5, 6],
       [5, 5],
       [5, 6],
       [5, 6],
       [7, 6],
       [6, 6],
       [5, 6],
       [5,

In [68]:
output_stack1 = np.column_stack((knn_output, svc_output))

In [70]:
rf.predict(output_stack1)

array([5, 6, 5, 5, 6, 6, 6, 6, 6, 5, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 5, 5, 6, 6, 6, 5, 6, 6, 5, 6, 6, 6, 6, 5, 5, 6, 5, 6, 6, 6, 6,
       6, 6, 5, 5, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6,
       6, 5, 6, 6, 5, 5, 5, 5, 6, 6, 6, 6, 5, 5, 6, 6, 5, 6, 5, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 6, 6, 6, 5, 6, 5, 5, 6, 6, 6,
       5, 6, 6, 5, 6, 6, 6, 6, 6, 5], dtype=int64)

In [71]:
rf.score(output_stack1, y_test)

0.5333333333333333

In [72]:
"""###homework : activity recognition with healthy older people using a batteryless wearable sensor dataset
    
    using LR, KNN, SVC, DT, stacking, bagging"""

'###homework : activity recognition with healthy older people using a batteryless wearable sensor dataset\n    \n    using LR, KNN, SVC, DT, stacking, bagging'