### This is just a brief notebook for applying Logistic Regression and hyperparameter tuning using GridSearchCV

In [1]:
import numpy as np 
import pandas as pd 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

## Loading and exploring the data

In [3]:
# Read the survey data from a CSV file located in the working directory.
df = pd.read_csv('b_depressed.csv')
# Display 10 randomly chosen rows; no random_state is passed, so the rows shown
# differ from run to run.
df.sample(10)

Unnamed: 0,Survey_id,Ville_id,sex,Age,Married,Number_children,education_level,total_members,gained_asset,durable_asset,...,incoming_salary,incoming_own_farm,incoming_business,incoming_no_business,incoming_agricultural,farm_expenses,labor_primary,lasting_investment,no_lasting_investmen,depressed
105,940,141,1,23,1,2,8,4,19207828,14557971,...,0,1,0,0,12812296,14680755,0,34566568,14680755.0,0
359,465,70,1,40,0,2,10,3,43026714,13773218,...,1,0,0,0,20019213,83413382,1,18556351,13428887.0,0
1065,789,55,1,24,1,2,9,5,28912201,22861940,...,0,0,0,0,30028818,31363432,0,28411718,28292707.0,0
828,633,54,0,37,0,0,14,5,28912201,22861940,...,0,0,0,0,30028818,31363432,0,28411718,28292707.0,0
386,405,39,1,21,1,1,10,5,28912201,22861940,...,0,0,0,0,30028818,31363432,0,28411718,28292707.0,0
110,96,8,1,17,1,2,9,5,28912201,22861940,...,0,0,0,0,30028818,31363432,0,28411718,28292707.0,0
377,113,10,1,25,1,3,8,5,22485095,24423439,...,0,0,1,1,80076847,64506354,0,24991501,77719032.0,0
584,144,10,1,22,1,5,11,7,23855212,17136446,...,0,1,0,0,11878066,46933932,0,45059564,11099541.0,0
1328,505,77,1,40,1,8,9,10,20019212,25624592,...,0,0,0,0,17083061,18266418,0,66693719,58304844.0,1
570,422,71,1,31,1,4,10,5,28912201,22861940,...,0,0,0,0,30028818,31363432,0,28411718,28292707.0,0


In [4]:
# Print a summary of the frame: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1429 entries, 0 to 1428
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Survey_id              1429 non-null   int64  
 1   Ville_id               1429 non-null   int64  
 2   sex                    1429 non-null   int64  
 3   Age                    1429 non-null   int64  
 4   Married                1429 non-null   int64  
 5   Number_children        1429 non-null   int64  
 6   education_level        1429 non-null   int64  
 7   total_members          1429 non-null   int64  
 8   gained_asset           1429 non-null   int64  
 9   durable_asset          1429 non-null   int64  
 10  save_asset             1429 non-null   int64  
 11  living_expenses        1429 non-null   int64  
 12  other_expenses         1429 non-null   int64  
 13  incoming_salary        1429 non-null   int64  
 14  incoming_own_farm      1429 non-null   int64  
 15  inco

In [5]:
# Count the missing values in each column.
df.isna().sum()

Survey_id                 0
Ville_id                  0
sex                       0
Age                       0
Married                   0
Number_children           0
education_level           0
total_members             0
gained_asset              0
durable_asset             0
save_asset                0
living_expenses           0
other_expenses            0
incoming_salary           0
incoming_own_farm         0
incoming_business         0
incoming_no_business      0
incoming_agricultural     0
farm_expenses             0
labor_primary             0
lasting_investment        0
no_lasting_investmen     20
depressed                 0
dtype: int64

### Removing 20 rows with null values

In [6]:
# Remove the rows whose 'no_lasting_investmen' value is missing (the only column
# that reported nulls above). dropna(subset=...) expresses this directly, and
# rebinding `df` instead of mutating in place keeps the cell free of hidden
# side effects on the previously displayed frame.
df = df.dropna(subset=['no_lasting_investmen'])
# Re-check the per-column null counts to confirm no missing values remain.
df.isnull().sum()

Survey_id                0
Ville_id                 0
sex                      0
Age                      0
Married                  0
Number_children          0
education_level          0
total_members            0
gained_asset             0
durable_asset            0
save_asset               0
living_expenses          0
other_expenses           0
incoming_salary          0
incoming_own_farm        0
incoming_business        0
incoming_no_business     0
incoming_agricultural    0
farm_expenses            0
labor_primary            0
lasting_investment       0
no_lasting_investmen     0
depressed                0
dtype: int64

In [7]:
# Display the column labels of the frame.
df.columns

Index(['Survey_id', 'Ville_id', 'sex', 'Age', 'Married', 'Number_children',
       'education_level', 'total_members', 'gained_asset', 'durable_asset',
       'save_asset', 'living_expenses', 'other_expenses', 'incoming_salary',
       'incoming_own_farm', 'incoming_business', 'incoming_no_business',
       'incoming_agricultural', 'farm_expenses', 'labor_primary',
       'lasting_investment', 'no_lasting_investmen', 'depressed'],
      dtype='object')

## Scaling large values 

In [8]:
# One-hot encode 'Ville_id' and 'education_level': each of these columns is
# replaced by one indicator column per distinct value it contains.
df = pd.get_dummies(df, columns=['Ville_id', 'education_level'])

In [9]:
# Standardise every column whose mean exceeds 10,000 and write the scaled
# values back into the frame.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed later in the notebook, so statistics of the test
# rows influence the training features. Consider fitting the scaler on the
# training split only (e.g. inside a Pipeline).
cols = df.columns.values
large_data = list(filter(lambda column: df[column].mean() > 10000, cols))
scaler = StandardScaler()
df_std = scaler.fit_transform(df[large_data])
df[large_data] = df_std

## Target and feature values

In [10]:
# Target: the 'depressed' column.
y = df['depressed']
# Features: every remaining column except 'Survey_id'.
X = df.drop(columns=['depressed', 'Survey_id'])

# GridSearchCV for finding the best parameters

In [11]:
# Base estimator whose hyperparameters are tuned by the grid search.
lg_model = LogisticRegression()

# Values shared by every sub-grid.
c_values = np.logspace(-4, 4, 20)          # 20 values of C from 1e-4 to 1e4
max_iter_values = [100, 1000, 2500, 5000]

# The grid is a list of dicts so that each penalty is only combined with
# solvers that actually support it:
#   * 'l2' works with lbfgs, newton-cg and sag.
#   * 'elasticnet' is only implemented by the 'saga' solver and requires
#     'l1_ratio'; pairing it with lbfgs/newton-cg/sag makes every such fit fail.
# The unpenalised option is intentionally left out: its spelling is
# version-dependent ('none' was removed in scikit-learn 1.4 in favour of None),
# C is ignored when there is no penalty (so crossing it with 20 C values only
# repeats identical fits), and the largest C values already correspond to
# almost no regularisation.
param_grid = [
    {
        'penalty': ['l2'],
        'C': c_values,
        'solver': ['lbfgs', 'newton-cg', 'sag'],
        'max_iter': max_iter_values,
    },
    {
        'penalty': ['elasticnet'],
        'l1_ratio': [0.25, 0.5, 0.75],
        'C': c_values,
        'solver': ['saga'],
        'max_iter': max_iter_values,
    },
]

## what is C?
Because our dataset is small (1,429 rows), we try a range of values for `C` to reduce overfitting. `C` is the inverse of the regularization strength: a high `C` means "Trust this training data a lot", while a low value says "This data may not be fully representative of the real world data, so if it's telling you to make a parameter really large, don't listen to it". See [here](https://stackoverflow.com/questions/67513075/what-is-c-parameter-in-sklearn-logistic-regression/).

In [12]:
# Exhaustive search over param_grid with 3-fold cross-validation.
# n_jobs=-1 uses all available processors to make the search a little faster.
clf = GridSearchCV(
    estimator=lg_model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
)

#### When we perform hyperparameter tuning, information about the dataset still **'leaks'** into the model. So, before running GridSearchCV, I split the data into training and test sets.

In [13]:
# Hold out 25% of the rows as a test set; random_state fixes the shuffle so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.25, random_state = 100)

### Fitting the GridSearchCV on train data

In [None]:
# Run the grid search on the training split only. fit() returns the fitted
# search object itself, so best_clf refers to the same object as clf.
best_clf = clf.fit(X_train,y_train)

## Let's see the best estimator and params

In [None]:
# A notebook cell only renders the value of its last expression, so the best
# estimator on the first line was never shown. Print both results explicitly.
print(best_clf.best_estimator_)
print(best_clf.best_params_)

## Accuracy for training data

In [None]:
# Score the refitted best estimator on the data it was trained on.
train_accuracy = best_clf.score(X_train, y_train)
print(f'Accuracy for training data : {train_accuracy:.3f}')

### Applying the best parameters on test data

In [None]:
# Build the final model from the hyperparameters selected by the grid search
# (read from best_params_ so they cannot drift from the actual search result)
# and train it on the TRAINING split only. Fitting on (X_test, y_test) would
# mean the model is later scored on the very rows it was trained on, so the
# reported test accuracy and confusion matrix would not measure generalisation.
clf2 = LogisticRegression(**best_clf.best_params_)
# fit() returns the estimator itself, so test_clf and clf2 are the same object.
test_clf = clf2.fit(X_train, y_train)

In [None]:
# Mean accuracy of the final model on the held-out test split.
test_accuracy = test_clf.score(X_test, y_test)
print(f'Accuracy for test data is : {test_accuracy:.3f}')

### Confusion Matrix

In [None]:
# Predict labels for the test split and tabulate them against the true labels
# (rows are true classes, columns are predicted classes).
y_pred = clf2.predict(X_test)
test_confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(test_confusion)

### Even after iterating over different parameters, the accuracy is still low and the model failed to predict 51 depressed cases. Any ideas?