### This is just a brief Notebook for appyling Logistic Regression and hyperparameter tuning using GridSearchCV 

In [1]:
import numpy as np 
import pandas as pd 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

## Loading and exploring the data

In [2]:
df = pd.read_csv('b_depressed.csv')
df.sample(10)

Unnamed: 0,Survey_id,Ville_id,sex,Age,Married,Number_children,education_level,total_members,gained_asset,durable_asset,...,incoming_salary,incoming_own_farm,incoming_business,incoming_no_business,incoming_agricultural,farm_expenses,labor_primary,lasting_investment,no_lasting_investmen,depressed
1052,703,23,1,65,1,1,5,2,28912201,83279922,...,0,1,0,0,26692283,22243569,0,89686073,58278149.0,1
1395,110,16,1,20,1,2,9,4,10409991,60858406,...,0,1,0,0,24023056,45777268,0,58404114,65436134.0,0
53,588,52,1,53,1,3,8,5,28912201,22861940,...,0,0,0,0,30028818,31363432,0,28411718,28292707.0,0
1063,620,14,1,26,0,3,7,5,28912201,22861940,...,0,0,0,0,30028818,31363432,0,28411718,28292707.0,0
1387,1062,214,1,28,1,4,12,6,30108896,22861940,...,0,0,0,1,53384566,51960976,0,50556335,54349934.0,1
1161,532,48,1,67,1,0,8,3,25096085,24023054,...,1,0,0,0,10676913,88974275,1,33048987,79098129.0,1
1206,702,107,1,30,0,8,14,10,20980135,64061481,...,0,1,0,1,16329004,11537184,0,13417872,2458971.0,0
1077,1251,200,1,27,0,3,10,5,28912201,22861940,...,0,0,0,0,30028818,31363432,0,28411718,28292707.0,0
1072,712,109,1,26,1,3,9,5,22375139,12812296,...,0,0,0,0,52049952,3558971,0,1553027,5961277.0,0
569,604,56,1,34,1,3,14,5,28912201,22861940,...,0,0,0,0,30028818,31363432,0,28411718,28292707.0,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1429 entries, 0 to 1428
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Survey_id              1429 non-null   int64  
 1   Ville_id               1429 non-null   int64  
 2   sex                    1429 non-null   int64  
 3   Age                    1429 non-null   int64  
 4   Married                1429 non-null   int64  
 5   Number_children        1429 non-null   int64  
 6   education_level        1429 non-null   int64  
 7   total_members          1429 non-null   int64  
 8   gained_asset           1429 non-null   int64  
 9   durable_asset          1429 non-null   int64  
 10  save_asset             1429 non-null   int64  
 11  living_expenses        1429 non-null   int64  
 12  other_expenses         1429 non-null   int64  
 13  incoming_salary        1429 non-null   int64  
 14  incoming_own_farm      1429 non-null   int64  
 15  inco

In [4]:
df.isnull().sum()

Survey_id                 0
Ville_id                  0
sex                       0
Age                       0
Married                   0
Number_children           0
education_level           0
total_members             0
gained_asset              0
durable_asset             0
save_asset                0
living_expenses           0
other_expenses            0
incoming_salary           0
incoming_own_farm         0
incoming_business         0
incoming_no_business      0
incoming_agricultural     0
farm_expenses             0
labor_primary             0
lasting_investment        0
no_lasting_investmen     20
depressed                 0
dtype: int64

### Removing 20 rows with null values

In [5]:
df.drop(df[(df['no_lasting_investmen'].isnull())].index, axis = 0, inplace = True)
df.isnull().sum()

Survey_id                0
Ville_id                 0
sex                      0
Age                      0
Married                  0
Number_children          0
education_level          0
total_members            0
gained_asset             0
durable_asset            0
save_asset               0
living_expenses          0
other_expenses           0
incoming_salary          0
incoming_own_farm        0
incoming_business        0
incoming_no_business     0
incoming_agricultural    0
farm_expenses            0
labor_primary            0
lasting_investment       0
no_lasting_investmen     0
depressed                0
dtype: int64

In [6]:
df.columns

Index(['Survey_id', 'Ville_id', 'sex', 'Age', 'Married', 'Number_children',
       'education_level', 'total_members', 'gained_asset', 'durable_asset',
       'save_asset', 'living_expenses', 'other_expenses', 'incoming_salary',
       'incoming_own_farm', 'incoming_business', 'incoming_no_business',
       'incoming_agricultural', 'farm_expenses', 'labor_primary',
       'lasting_investment', 'no_lasting_investmen', 'depressed'],
      dtype='object')

## Scaling large values 

In [7]:
df = pd.get_dummies(df, columns=['Ville_id', 'education_level'])

In [8]:
cols = df.columns.values
large_data = [i for i in cols if df[i].mean() > 10000]
scaler = StandardScaler()
df_std = scaler.fit_transform(df[large_data])
df[large_data]= df_std 

## Target and feature values

In [9]:
X = df.drop(['depressed','Survey_id'], axis = 1)
y = df['depressed']

# GridSearchCV for finding the best parameters

In [10]:
lg_model = LogisticRegression()
param_grid = [    
    {'penalty' : ['l2', 'elasticnet', 'none'],
    'C' : np.logspace(-4, 4, 20),
    'solver' : ['lbfgs','newton-cg','sag'],
    'max_iter' : [100, 1000,2500, 5000]
    }
]

## what is C?
In our case which the data is not too much (1429 rows) we use different range of "C" to reduce the "Overfitting". A high C means "Trust this training data a lot", while a low value says "This data may not be fully representative of the real world data, so if it's telling you to make a parameter really large, don't listen to it". [here](http:/https://stackoverflow.com/questions/67513075/what-is-c-parameter-in-sklearn-logistic-regression/)

In [11]:
clf =  GridSearchCV(lg_model, param_grid = param_grid, cv = 3, n_jobs=-1);  #I concider n_job as -1 to use all processors and make it a little faster.

#### When we perform hyperparameter tuning information about dataset still **'leaks'** into the algorithm. SO, before doing the GridSearchCV, I decided to split the train and test data.

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.25, random_state = 100)

### Fitting the GridSearchCV on train data

In [13]:
best_clf = clf.fit(X_train,y_train)

720 fits failed out of a total of 2160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
240 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\sahoo\anaconda3\envs\iimenv1\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\sahoo\anaconda3\envs\iimenv1\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\sahoo\anaconda3\envs\iimenv1\lib\site-packages\sklearn\linear_model\_logistic.py", line 449, in _check_solver
    % (solver, penalty)
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, 

## Let's see the best estimator and params

In [14]:
best_clf.best_estimator_
best_clf.best_params_

{'C': 0.0001, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}

## Accuracy for training data

In [15]:
print (f'Accuracy for training data : {best_clf.score(X_train,y_train):.3f}')

Accuracy for training data : 0.826


### Applying the best parameters on test data

In [16]:
clf2 = LogisticRegression(penalty = 'l2', C = 0.0001,solver = 'lbfgs', max_iter= 100)
test_clf = clf2.fit(X_test,y_test)

In [17]:
print (f'Accuracy for test data is : {test_clf.score(X_test,y_test):.3f}')

Accuracy for test data is : 0.856


### Confusion Matrix

In [18]:
y_pred = clf2.predict(X_test)
print(confusion_matrix(y_test, y_pred))

[[302   0]
 [ 51   0]]


In [24]:
from sklearn.svm import SVC

In [25]:
classifier = SVC(kernel='linear')
# model = SVC()

In [26]:
#training the support vector Machine Classifier
classifier.fit(X_train, y_train)

SVC(kernel='linear')

In [28]:
# accuracy score on the training data
from sklearn.metrics import accuracy_score
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, y_train)

In [29]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.84375


In [30]:
import pickle

filename = 'depression_model.sav'
pickle.dump(clf2, open(filename, 'wb'))

### Even after iterating over different parameters the accuracy is still down and the model did not predict 51 depressed cases. Any idea?