**Setup**


In [1]:
#Common Imports
import numpy as np
import pandas as pd

np.random.seed(825340)

**Importing the dataset baseball.csv**

In [2]:
#I will predict the "attendance_binary" value in the data set:
baseball = pd.read_csv("baseball.csv")
baseball.head()

Unnamed: 0,attendance_binary,previous_attendance,previous_away_team_errors,previous_away_team_hits,previous_away_team_runs,game_type,previous_game_type,previous_home_team_errors,previous_home_team_hits,previous_home_team_runs,game_day,previous_game_day,temperature,wind_speed,sky,previous_game_duration,previous_homewin
0,0,43683,2,6,2,Night Game,Day Game,0,6,6,Wednesday,Monday,55,24,Overcast,2.933333,1
1,0,45785,0,7,2,Night Game,Day Game,0,10,3,Wednesday,Monday,48,7,Unknown,2.8,1
2,0,48282,0,8,4,Night Game,Day Game,2,4,3,Wednesday,Monday,65,10,Cloudy,3.383333,0
3,0,21830,0,9,6,Day Game,Night Game,0,15,11,Wednesday,Tuesday,77,0,In Dome,3.233333,1
4,0,49289,2,4,2,Night Game,Day Game,1,1,3,Tuesday,Monday,81,12,Cloudy,2.633333,1


In [3]:
baseball

Unnamed: 0,attendance_binary,previous_attendance,previous_away_team_errors,previous_away_team_hits,previous_away_team_runs,game_type,previous_game_type,previous_home_team_errors,previous_home_team_hits,previous_home_team_runs,game_day,previous_game_day,temperature,wind_speed,sky,previous_game_duration,previous_homewin
0,0,43683,2,6,2,Night Game,Day Game,0,6,6,Wednesday,Monday,55,24,Overcast,2.933333,1
1,0,45785,0,7,2,Night Game,Day Game,0,10,3,Wednesday,Monday,48,7,Unknown,2.800000,1
2,0,48282,0,8,4,Night Game,Day Game,2,4,3,Wednesday,Monday,65,10,Cloudy,3.383333,0
3,0,21830,0,9,6,Day Game,Night Game,0,15,11,Wednesday,Tuesday,77,0,In Dome,3.233333,1
4,0,49289,2,4,2,Night Game,Day Game,1,1,3,Tuesday,Monday,81,12,Cloudy,2.633333,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2422,1,44020,0,11,9,Night Game,Night Game,1,3,0,Tuesday,Monday,76,8,Unknown,3.133333,0
2423,0,31042,2,7,5,Night Game,Day Game,1,7,3,Monday,Sunday,72,0,In Dome,2.850000,0
2424,0,24123,0,7,2,Day Game,Night Game,0,6,1,Wednesday,Tuesday,66,0,In Dome,2.483333,0
2425,0,36911,1,12,8,Day Game,Night Game,2,10,7,Wednesday,Tuesday,71,0,In Dome,3.783333,0


**Splitting the dataset into train and test**

In [4]:
from sklearn.model_selection import train_test_split

training_set, testing_set = train_test_split(baseball, test_size=0.3)

In [5]:
#Checking the missing values in the training dataset
training_set.isna().sum()

attendance_binary            0
previous_attendance          0
previous_away_team_errors    0
previous_away_team_hits      0
previous_away_team_runs      0
game_type                    0
previous_game_type           0
previous_home_team_errors    0
previous_home_team_hits      0
previous_home_team_runs      0
game_day                     0
previous_game_day            0
temperature                  0
wind_speed                   0
sky                          0
previous_game_duration       0
previous_homewin             0
dtype: int64

In [6]:
#Checking the missing values in the testing dataset
testing_set.isna().sum()

attendance_binary            0
previous_attendance          0
previous_away_team_errors    0
previous_away_team_hits      0
previous_away_team_runs      0
game_type                    0
previous_game_type           0
previous_home_team_errors    0
previous_home_team_hits      0
previous_home_team_runs      0
game_day                     0
previous_game_day            0
temperature                  0
wind_speed                   0
sky                          0
previous_game_duration       0
previous_homewin             0
dtype: int64

In [7]:
training_set.shape

(1698, 17)

In [8]:
testing_set.shape

(729, 17)

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [10]:
#Seperating the target variable as We do not want to transform it
train_target_set = training_set[['attendance_binary']]
test_target_set = testing_set[['attendance_binary']]

train_inputs_set = training_set.drop(['attendance_binary'], axis=1)
test_inputs_set = testing_set.drop(['attendance_binary'], axis=1)

In [11]:
#Checking the target in training dataset
train_target_set

Unnamed: 0,attendance_binary
603,1
1444,0
216,0
132,1
449,0
...,...
1517,0
645,1
1242,1
462,0


In [12]:
#Checking the target in testing dataset
test_target_set

Unnamed: 0,attendance_binary
543,0
2099,0
1835,1
571,1
305,0
...,...
1194,0
381,1
391,1
1004,1


In [13]:
#Checking the inputs in training dataset
train_inputs_set

Unnamed: 0,previous_attendance,previous_away_team_errors,previous_away_team_hits,previous_away_team_runs,game_type,previous_game_type,previous_home_team_errors,previous_home_team_hits,previous_home_team_runs,game_day,previous_game_day,temperature,wind_speed,sky,previous_game_duration,previous_homewin
603,43397,0,7,4,Night Game,Night Game,1,1,1,Tuesday,Monday,82,6,Cloudy,2.283333,0
1444,30012,1,15,7,Night Game,Day Game,0,8,3,Friday,Thursday,73,3,Cloudy,3.116667,0
216,23983,1,6,1,Night Game,Night Game,0,8,2,Saturday,Friday,80,7,Cloudy,3.700000,1
132,32189,1,6,3,Night Game,Night Game,0,10,4,Wednesday,Tuesday,89,11,Cloudy,2.800000,1
449,29934,0,4,1,Night Game,Night Game,0,6,2,Tuesday,Monday,85,5,Sunny,2.583333,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1517,33040,1,14,6,Day Game,Night Game,0,14,8,Saturday,Saturday,81,5,Sunny,3.250000,1
645,28469,0,7,4,Night Game,Night Game,0,9,5,Saturday,Friday,88,4,Cloudy,3.516667,1
1242,31456,1,16,11,Night Game,Day Game,1,8,3,Thursday,Wednesday,83,13,Unknown,3.450000,0
462,30137,0,12,6,Day Game,Night Game,3,8,1,Sunday,Saturday,74,2,Cloudy,3.466667,0


In [14]:
#Checking the target in testing dataset
test_inputs_set

Unnamed: 0,previous_attendance,previous_away_team_errors,previous_away_team_hits,previous_away_team_runs,game_type,previous_game_type,previous_home_team_errors,previous_home_team_hits,previous_home_team_runs,game_day,previous_game_day,temperature,wind_speed,sky,previous_game_duration,previous_homewin
543,32304,0,5,2,Day Game,Night Game,0,7,1,Sunday,Saturday,63,1,Cloudy,2.550000,0
2099,17043,1,9,3,Night Game,Night Game,0,8,4,Thursday,Wednesday,74,4,Sunny,2.800000,1
1835,30781,2,4,6,Night Game,Night Game,0,8,2,Saturday,Friday,87,6,Sunny,2.533333,0
571,42159,0,10,5,Night Game,Night Game,0,8,0,Saturday,Friday,83,6,Sunny,3.250000,0
305,15247,0,13,15,Night Game,Night Game,1,5,2,Friday,Thursday,70,11,Sunny,2.900000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1194,31978,1,18,15,Night Game,Day Game,3,9,5,Monday,Sunday,88,15,Overcast,3.216667,0
381,20225,0,11,5,Day Game,Day Game,0,8,3,Sunday,Saturday,66,2,Cloudy,3.016667,0
391,30857,0,12,7,Day Game,Night Game,2,7,4,Saturday,Friday,66,4,Overcast,3.133333,0
1004,43134,0,15,9,Night Game,Night Game,2,6,2,Wednesday,Tuesday,84,10,Sunny,2.933333,0


In [15]:
#Identifying the data types of the inputs
train_inputs_set.dtypes

previous_attendance            int64
previous_away_team_errors      int64
previous_away_team_hits        int64
previous_away_team_runs        int64
game_type                     object
previous_game_type            object
previous_home_team_errors      int64
previous_home_team_hits        int64
previous_home_team_runs        int64
game_day                      object
previous_game_day             object
temperature                    int64
wind_speed                     int64
sky                           object
previous_game_duration       float64
previous_homewin               int64
dtype: object

In [16]:
##These are only column names

# Identifying the numerical columns
num_columns = train_inputs_set.select_dtypes(include=[np.number]).columns.to_list()

# Identifying the categorical columns
cat_columns = train_inputs_set.select_dtypes('object').columns.to_list()

In [17]:
# Identifying the binary columns so I can pass them through without transforming
bin_columns = ['previous_homewin']

In [18]:
num_columns

['previous_attendance',
 'previous_away_team_errors',
 'previous_away_team_hits',
 'previous_away_team_runs',
 'previous_home_team_errors',
 'previous_home_team_hits',
 'previous_home_team_runs',
 'temperature',
 'wind_speed',
 'previous_game_duration',
 'previous_homewin']

In [19]:
cat_columns

['game_type', 'previous_game_type', 'game_day', 'previous_game_day', 'sky']

In [20]:
#Since numerical columns already includes the binary columns,Iam removing the binary columns from numerical columns.

for col in bin_columns:
    num_columns.remove(col)

In [21]:
bin_columns

['previous_homewin']

In [22]:
#Checking whether the binary column has been removed from the numeric columns
num_columns

['previous_attendance',
 'previous_away_team_errors',
 'previous_away_team_hits',
 'previous_away_team_runs',
 'previous_home_team_errors',
 'previous_home_team_hits',
 'previous_home_team_runs',
 'temperature',
 'wind_speed',
 'previous_game_duration']

**Pipeline**

In [23]:
 num_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())])

In [24]:
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [25]:
bin_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))])

In [26]:
preprocessor = ColumnTransformer([
        ('num', num_transformer, num_columns),
        ('cat', cat_transformer, cat_columns),
        ('binary', bin_transformer, bin_columns)],
        remainder='drop')

In [27]:
#Fit and transform the train data
train_x = preprocessor.fit_transform(train_inputs_set)

train_x

array([[ 1.31878323, -0.73505738, -0.51158454, ...,  0.        ,
         0.        ,  0.        ],
       [-0.03986171,  0.50933687,  1.79105441, ...,  0.        ,
         0.        ,  0.        ],
       [-0.65183559,  0.50933687, -0.7994144 , ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.10671157,  0.50933687,  2.07888428, ...,  0.        ,
         1.        ,  0.        ],
       [-0.02717358, -0.73505738,  0.92756481, ...,  0.        ,
         0.        ,  0.        ],
       [-1.3389232 ,  1.75373111,  0.0640752 , ...,  0.        ,
         0.        ,  1.        ]])

In [28]:
train_x.shape

(1698, 37)

In [29]:
# Transform the test data
test_x = preprocessor.transform(test_inputs_set)

test_x

array([[ 0.19278784, -0.73505738, -1.08724427, ...,  0.        ,
         0.        ,  0.        ],
       [-1.35628056,  0.50933687,  0.0640752 , ...,  1.        ,
         0.        ,  1.        ],
       [ 0.03819566,  1.75373111, -1.37507414, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 0.04591005, -0.73505738,  0.92756481, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.29208741, -0.73505738,  1.79105441, ...,  1.        ,
         0.        ,  0.        ],
       [-0.89494016,  1.75373111, -1.95073388, ...,  1.        ,
         0.        ,  1.        ]])

In [30]:
test_x.shape

(729, 37)

In [31]:
#Since we are predicting the target variable which is binary, I am using classifier
from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy="most_frequent")

dummy_clf.fit(train_x, train_target_set)

In [32]:
from sklearn.metrics import accuracy_score

In [33]:
# This is the baseline Train Accuracy

dummy_train_pred = dummy_clf.predict(train_x)

baseline_train_acc = accuracy_score(train_target_set, dummy_train_pred)

print('Baseline Train Accuracy: {}' .format(baseline_train_acc))

Baseline Train Accuracy: 0.5223792697290931


In [34]:
# This is the baseline Test Accuracy

dummy_test_pred = dummy_clf.predict(test_x)

baseline_test_acc = accuracy_score(test_target_set, dummy_test_pred)

print('Baseline Test Accuracy: {}' .format(baseline_test_acc))

Baseline Test Accuracy: 0.5102880658436214


## SVM Model 1:

**Model1 :SVC with "liner" kernel**

In [35]:
from sklearn.svm import SVC
 
linear_svm = SVC(kernel="linear")

linear_svm.fit(train_x, train_target_set)

  y = column_or_1d(y, warn=True)


In [36]:
#Importing the accuracy score
from sklearn.metrics import accuracy_score

**Accuracy metric**

In [37]:
#Predicting the train values
train_y_pred_linear = linear_svm.predict(train_x)

#Train accuracy
accuracy_score(train_target_set, train_y_pred_linear)

0.8274440518256773

In [38]:
#Predicting the test values
test_y_pred_linear = linear_svm.predict(test_x)

#Test accuracy
accuracy_score(test_target_set, test_y_pred_linear)

0.8477366255144033

Here in the above model, I just got lucky due to the split, as my testing dataset accuracy is greater than the training dataset accuracy. And the test dataset accuracy of the model with linear kernel which is 84.77% is far better than the baseline accuracy which is 51.02%

In [39]:
#Finding the confusion matrix for the SVC model with linear kernel
from sklearn.metrics import confusion_matrix
confusion_matrix(test_target_set, test_y_pred_linear)

array([[309,  48],
       [ 63, 309]], dtype=int64)

From the above matrix, I can see that the values in the diagonal passing from top left to bottom right which are also known as true positives and true negatives are greater compared to other values.

In [40]:
#This report displays other metrics used for evaluating the model
from sklearn.metrics import classification_report
print(classification_report(test_target_set, test_y_pred_linear))

              precision    recall  f1-score   support

           0       0.83      0.87      0.85       357
           1       0.87      0.83      0.85       372

    accuracy                           0.85       729
   macro avg       0.85      0.85      0.85       729
weighted avg       0.85      0.85      0.85       729



## SVM Model 2:

**Model2 :SVC with "poly" kernel**

**=>Model2:Version 1**

In [41]:
from sklearn.svm import SVC

#Building the SVC model with poly kernel and degree 3
poly_svm_v1 = SVC(kernel="poly", degree=3, coef0=0.01, C=10)

poly_svm_v1.fit(train_x, train_target_set)

  y = column_or_1d(y, warn=True)


**Accuracy metric**

In [42]:
#Predicting the train values
train_y_pred_poly = poly_svm_v1.predict(train_x)

#Train accuracy
accuracy_score(train_target_set, train_y_pred_poly)

0.9734982332155477

In [43]:
#Predicting the test values
test_y_pred_poly = poly_svm_v1.predict(test_x)

#Test accuracy
accuracy_score(test_target_set, test_y_pred_poly)

0.7928669410150891

Here, in the above **Model2: Version 1** , I have observed that the training dataset accuracy is far better than the baseline accuracy and even the testing dataset accuracy is better than baseline accuracy. But I observed **"OVERFITTING"** issue in this model because the difference between the training dataset accuracy and testing dataset accuracy is significantly more. This means the model is learning more and I need to restrict the model from learning more. **I will try by decreasing the degree of the polynomial in the second version**.

**=>Model2:Version 2 (Addressing the OVERFITTING issue)**

In [44]:
from sklearn.svm import SVC

#Building the SVC model with poly kernel and degree 2
poly_svm_v2 = SVC(kernel="poly", degree=2, coef0=0.01, C=10)

poly_svm_v2.fit(train_x, train_target_set)

  y = column_or_1d(y, warn=True)


**Accuracy metric**

In [45]:
#Predicting the train values
train_y_pred_poly = poly_svm_v2.predict(train_x)

#Train accuracy
accuracy_score(train_target_set, train_y_pred_poly)

0.8904593639575972

In [46]:
#Predicting the test values
test_y_pred_poly = poly_svm_v2.predict(test_x)

#Test accuracy
accuracy_score(test_target_set, test_y_pred_poly)

0.823045267489712

Here, in the above **Model2: Version 2** , I have observed that the training dataset accuracy is far better than the baseline accuracy and even the testing dataset accuracy is better than baseline accuracy. And **OVERFITTING** issue which has been observed in Model2: Version 1 has been addressed as there is no significant difference between training dataset accuracy and testing dataset accuracy. This has been achieved by restraining model from learning more.

**=> Model2:Version 3 (After Addressing the OVERFITTING issue in version 2, checking whether I can increase the testing dataset accuracy)**

In [47]:
from sklearn.svm import SVC

#Building the SVC model with poly kernel and degree 2
poly_svm_v3 = SVC(kernel="poly", degree=2, coef0=0.0001, C=10)

poly_svm_v3.fit(train_x, train_target_set)

  y = column_or_1d(y, warn=True)


**Accuracy metric**

In [48]:
#Predicting the train values
train_y_pred_poly = poly_svm_v3.predict(train_x)

#Train accuracy
accuracy_score(train_target_set, train_y_pred_poly)

0.8898704358068316

In [49]:
#Predicting the test values
test_y_pred_poly = poly_svm_v3.predict(test_x)

#Test accuracy
accuracy_score(test_target_set, test_y_pred_poly)

0.823045267489712

Here, in the above **Model2: Version 3** , I have tried decreasing the coef0 value to lessen the influence on the model due to higher degree polynomials. **But there is no much difference between the accuracies across the Model2: Version 2 and Model2: Version 3**.

In [50]:
#Finding the confusion matrix for the SVC model with linear kernel
from sklearn.metrics import confusion_matrix
confusion_matrix(test_target_set, test_y_pred_poly)

array([[295,  62],
       [ 67, 305]], dtype=int64)

From the above matrix, we can see that the values in the diagonal passing from top left to bottom right which are also known as true positives and true negatives are greater compared to other values.

In [51]:
#This report displays other metrics used for evaluating the model
from sklearn.metrics import classification_report
print(classification_report(test_target_set, test_y_pred_poly))

              precision    recall  f1-score   support

           0       0.81      0.83      0.82       357
           1       0.83      0.82      0.83       372

    accuracy                           0.82       729
   macro avg       0.82      0.82      0.82       729
weighted avg       0.82      0.82      0.82       729



## SVM Model 3:

**Model3 :SVC with "rbf" kernel**

**=>Model3:Version 1**

In [52]:
rbf_svm_v1 = SVC(kernel="rbf", C=10, gamma='scale')

rbf_svm_v1.fit(train_x, train_target_set)

  y = column_or_1d(y, warn=True)


**Accuracy metric**

In [53]:
#Predicting the train values
train_y_pred_rbf = rbf_svm_v1.predict(train_x)

#Train accuracy
accuracy_score(train_target_set, train_y_pred_rbf)

0.9740871613663133

In [54]:
#Predicting the test values
test_y_pred_rbf = rbf_svm_v1.predict(test_x)

#Test accuracy
accuracy_score(test_target_set, test_y_pred_rbf)

0.8106995884773662

Here, in the above **Model 3: Version 1** , I have observed that the training dataset accuracy is far better than the baseline accuracy and even the testing dataset accuracy is better than baseline accuracy. But I observed **"OVERFITTING"** issue in this model because the difference between the training dataset accuracy and testing dataset accuracy is significantly more. This means the model is learning more and I need to restrict the model from learning more. **I will try by decreasing the gamma value in the second version**.

**=>Model3:Version 2 (Addressing the OVERFITTING issue)**

In [55]:
rbf_svm_v2 = SVC(kernel="rbf", C=10, gamma=0.01)

rbf_svm_v2.fit(train_x, train_target_set)

  y = column_or_1d(y, warn=True)


**Accuracy metric**

In [56]:
#Predicting the train values
train_y_pred = rbf_svm_v2.predict(train_x)

#Train accuracy
accuracy_score(train_target_set, train_y_pred)

0.8510011778563016

In [57]:
#Predicting the test values
test_y_pred = rbf_svm_v2.predict(test_x)

#Test accuracy
accuracy_score(test_target_set, test_y_pred)

0.8381344307270233

Here, in the above **Model3: Version 2** , I have observed that the training dataset accuracy is far better than the baseline accuracy and even the testing dataset accuracy is better than baseline accuracy. And **OVERFITTING** issue which has been observed in Model3: Version 1 has been addressed as there is no significant difference between training dataset accuracy and testing dataset accuracy. This has been achieved by restraining model from learning more.

In [58]:
#Finding the confusion matrix for the SVC model with linear kernel
from sklearn.metrics import confusion_matrix
confusion_matrix(test_target_set, test_y_pred_rbf)

array([[284,  73],
       [ 65, 307]], dtype=int64)

From the above matrix, we can see that the values in the diagonal passing from top left to bottom right which are also known as true positives and true negatives are greater compared to other values.

In [59]:
#This report displays other metrics used for evaluating the model
from sklearn.metrics import classification_report
print(classification_report(test_target_set, test_y_pred_rbf))

              precision    recall  f1-score   support

           0       0.81      0.80      0.80       357
           1       0.81      0.83      0.82       372

    accuracy                           0.81       729
   macro avg       0.81      0.81      0.81       729
weighted avg       0.81      0.81      0.81       729



## SGD Model 1:

In [60]:
from sklearn.linear_model import SGDClassifier 
sgd_logreg_none = SGDClassifier(max_iter=100, penalty=None, eta0=0.01) 

sgd_logreg_none.fit(train_x, train_target_set)

  y = column_or_1d(y, warn=True)


**Accuracy metric**

In [61]:
#Predicting the train values
train_y_pred_none = sgd_logreg_none.predict(train_x)

#Train accuracy
accuracy_score(train_target_set, train_y_pred_none)

0.8209658421672555

In [62]:
#Predict the test values
test_y_pred_none = sgd_logreg_none.predict(test_x)

#Test accuracy
accuracy_score(test_target_set, test_y_pred_none)

0.8395061728395061

In [63]:
#confusion matrix on test set

from sklearn.metrics import confusion_matrix

confusion_matrix(test_target_set, test_y_pred_none)

array([[294,  63],
       [ 54, 318]], dtype=int64)

In [64]:
#create the classification report on test set

from sklearn.metrics import classification_report

print(classification_report(test_target_set, test_y_pred_none))

              precision    recall  f1-score   support

           0       0.84      0.82      0.83       357
           1       0.83      0.85      0.84       372

    accuracy                           0.84       729
   macro avg       0.84      0.84      0.84       729
weighted avg       0.84      0.84      0.84       729



## SGD Model 2: (Change in penalty value)

In [65]:
# tol = stopping criterion
# eta0 = learning rate
# penalty = regularization term
# max_iter = number of passes over training data (i.e., epochs)

sgd_logreg_l2 = SGDClassifier(max_iter=100, penalty='l2', eta0=0.01) 

sgd_logreg_l2.fit(train_x, train_target_set)

  y = column_or_1d(y, warn=True)


**Accuracy metric**

In [66]:
#Predicting the train values
train_y_pred_l2 = sgd_logreg_l2.predict(train_x)

#Train accuracy
accuracy_score(train_target_set, train_y_pred_l2)

0.8150765606595995

In [67]:
#Predict the test values
test_y_pred_l2 = sgd_logreg_l2.predict(test_x)

#Test accuracy
accuracy_score(test_target_set, test_y_pred_l2)

0.8203017832647462

In [68]:
#confusion matrix on test set

from sklearn.metrics import confusion_matrix

confusion_matrix(test_target_set, test_y_pred_l2)

array([[313,  44],
       [ 87, 285]], dtype=int64)

In [69]:
# create the classification report on test set

from sklearn.metrics import classification_report

print(classification_report(test_target_set, test_y_pred_l2))

              precision    recall  f1-score   support

           0       0.78      0.88      0.83       357
           1       0.87      0.77      0.81       372

    accuracy                           0.82       729
   macro avg       0.82      0.82      0.82       729
weighted avg       0.83      0.82      0.82       729



## LogisticRegression Model:

In [70]:
from sklearn.linear_model import LogisticRegression

log_reg_none = LogisticRegression(penalty='none')

log_reg_none.fit(train_x, train_target_set)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


**Accuracy metric**

In [71]:
from sklearn.metrics import accuracy_score
#Predicting the train values
train_y_pred_log_reg = log_reg_none.predict(train_x)

#Train accuracy
accuracy_score(train_target_set, train_y_pred_log_reg)

0.8256772673733804

In [72]:
#Predict the test values
test_y_pred_log_reg = log_reg_none.predict(test_x)

#Test accuracy
accuracy_score(test_target_set, test_y_pred_log_reg)

0.8532235939643347

In [73]:
from sklearn.metrics import confusion_matrix

confusion_matrix(test_target_set, test_y_pred_log_reg)

array([[311,  46],
       [ 61, 311]], dtype=int64)

In [74]:
from sklearn.metrics import classification_report

print(classification_report(test_target_set, test_y_pred_log_reg))

              precision    recall  f1-score   support

           0       0.84      0.87      0.85       357
           1       0.87      0.84      0.85       372

    accuracy                           0.85       729
   macro avg       0.85      0.85      0.85       729
weighted avg       0.85      0.85      0.85       729



Logistic Regression model performs the best as it has the highest test score compared to all other models.The testing dataset accuracy is 85% which is far better than the baseline testing dataset accuracy which is 51%. There are also no issues like underfitting or overfitting. By looking at the confusion matrix, we can say that the True positives and True negatives rate is more compared to False positives and False negatives.

There is no overfitting in the best model as there is no significant difference between the testing dataset accuracy(85%) and training dataset accuracy(82%). This is because the model is learning the patterns rather than memorizing the values in the training dataset. 

## Is there any evidence of overfitting in the other models (besides the best model), why or why not? 


**1.	 SVM Model 2:**

•	Yes, in **Model2: Version 1(SVC with "poly" kernel(Deg-3,coef-0.01))** model, I have observed that the training dataset accuracy(97%) is far better than the baseline accuracy(52%) and even the testing dataset accuracy(79%) is better than baseline accuracy(51%) but I also observed **"OVERFITTING"** issue in this model because the difference between the training dataset accuracy and testing dataset accuracy is significantly more. This means the model is learning more and I need to restrict the model from learning more. **I will try by decreasing the degree of the polynomial in the second version.**

•	Here, **Model2: Version 2(SVC with "poly" kernel(Deg-2,coef-0.01))**, I have observed that the training dataset accuracy(89%) is far better than the baseline accuracy(52%) and even the testing dataset accuracy(82%) is better than baseline accuracy(51%). And **OVERFITTING** issue which has been observed in Model2: Version 1 has been addressed as there is no significant difference between training dataset accuracy and testing dataset accuracy. This has been achieved by restraining model from learning more.




**2.	 SVM Model 3:**


•	Here, in the **Model 3: Version 1(SVC with "rbf" kernel)**, I have observed that the training dataset accuracy(97%) is far better than the baseline accuracy(52%) and even the testing dataset accuracy(81%) is better than baseline accuracy(51%). But I observed **"OVERFITTING"** issue in this model because the difference between the training dataset accuracy and testing dataset accuracy is significantly more. This means the model is learning more and I need to restrict the model from learning more. **I will try by decreasing the gamma value in the second version.**

•	Here, in the **Model3: Version 2(SVC with "rbf" kernel(gamma-0.01))**, I have observed that the training dataset accuracy(85%) is far better than the baseline accuracy(52%) and even the testing dataset accuracy(83%) is better than baseline accuracy(51%). And **OVERFITTING** issue which has been observed in Model3: Version 1 has been addressed as there is no significant difference between training dataset accuracy and testing dataset accuracy. This has been achieved by restraining model from learning more.



