In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

In [2]:
# Absolute, machine-specific locations of the training and test CSV files.
datafile_train=r'/Users/lalitsachan/Dropbox/March onwards/PDS V2/Projects/P2/carvan_train.csv'
datafile_test=r'/Users/lalitsachan/Dropbox/March onwards/PDS V2/Projects/P2/carvan_test.csv'
# Read each file into its own DataFrame (train first, then test).
cd_train=pd.read_csv(filepath_or_buffer=datafile_train)
cd_test=pd.read_csv(filepath_or_buffer=datafile_test)

In [3]:
# all the columns in the data are stored as integers but that 
# does not mean they represent numeric information 
# look at the data dictionary and decide what columns should be considered 
# as categorical and treated accordingly 

# if you read carefully, the variables referring to L0 and L2 seem categorical.
# If you think some other variables also seem categorical in nature, feel free to create dummies for them.
# In this script, however, we are treating all the columns as numeric. You can improve on it.

# make sure that when you create dummies,
# the same number of them gets created for both the train and the test set

# many of the columns are already binary 0/1, most probably created as dummies
# from some other original column. You can leave them as they are.

In [4]:
# Total number of missing cells in each split, displayed as (train, test).
(cd_train.isna().sum().sum(), cd_test.isna().sum().sum())  # there are no missing values in the data

(0, 0)

In [5]:
# Name of the response column used throughout the notebook.
target = "V86"

In [6]:
# Class balance of the response column: number of training rows per label.
cd_train[target].value_counts()

0    5474
1     348
Name: V86, dtype: int64

In [7]:
# Features are every column except the response; the response is kept as a Series.
# `columns=` replaces the positional axis argument (`drop(target,1)`), which pandas
# deprecated and then removed in 2.0, so the original call fails on current pandas.
x_train=cd_train.drop(columns=target)
y_train=cd_train[target]

In [8]:
from sklearn.linear_model import LogisticRegression
# The parameter grid searched in this notebook includes penalty='l1'.
# liblinear supports both l1 and l2, whereas the default solver in newer
# scikit-learn releases (lbfgs) rejects l1, so the solver is pinned explicitly.
# On the older release this notebook was run with, the unset solver already
# resolved to liblinear, so results there are unchanged.
model=LogisticRegression(fit_intercept=True,solver='liblinear')
# you can try more complex algorithms such as rf, gbm, svm, xgboost etc.
# to improve performance

In [9]:
# Hyper-parameter grid: 2 penalties x 10 values of C x 2 class weightings = 40 candidates.
params=dict(
    penalty=['l1','l2'],
    C=np.linspace(0.01,100,num=10),
    class_weight=['balanced',None],
)
# you can try different ranges of parameter C

In [10]:
from sklearn.model_selection import GridSearchCV


In [11]:
# 10-fold cross-validated grid search over `params`, scored by ROC AUC,
# using every available core and verbose progress logging.
gs=GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='roc_auc',
    cv=10,
    n_jobs=-1,
    verbose=5,
)

In [12]:
gs.fit(x_train,y_train)
# Runs the full grid search; this took around 8-10 minutes for me.
# If you choose to build more complex models with a higher number of
# parameters, it is going to take much longer;
# using a randomised search will be a better idea.

Fitting 10 folds for each of 40 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:   12.2s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:  8.0min finished


GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'penalty': ['l1', 'l2'], 'C': array([1.000e-02, 1.112e+01, 2.223e+01, 3.334e+01, 4.445e+01, 5.556e+01,
       6.667e+01, 7.778e+01, 8.889e+01, 1.000e+02]), 'class_weight': ['balanced', None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=5)

In [13]:
# here we are using the default best model
# you can make use of a report function and see if you'd like to choose any other ranked
# parameter combination

In [14]:
# Top-ranked estimator found by the grid search (refit=True, so it is already fitted).
gs.best_estimator_

LogisticRegression(C=0.01, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False)

In [15]:
# Predicted probability of class 1 (second column) for every training row,
# taken from the refit best model.
train_score=gs.best_estimator_.predict_proba(X=x_train)[:,1]

In [16]:
# Alias for the training labels; the visible cells below use y_train directly
# and do not reference this name again.
real=y_train

In [17]:
# Candidate probability thresholds: 999 evenly spaced values from 0.001 to 0.999.
cutoffs=np.linspace(start=0.001,stop=0.999,num=999)

In [18]:
from sklearn.metrics import fbeta_score

In [19]:
# Collector for one F-beta score per candidate cutoff.
fbetas=list()

In [20]:
# Score every candidate cutoff with F2 (recall weighted above precision) on the
# training predictions. Note the cutoff is therefore tuned on the same data the
# model was fit on.
# Reset the collector here so re-running this cell does not append a second
# sweep and leave `fbetas` twice as long as `cutoffs`.
fbetas=[]
for cutoff in cutoffs:
    # Rows whose predicted probability exceeds the cutoff are labelled 1.
    predicted=(train_score>cutoff).astype(int)
    # `beta` must be passed by keyword: current scikit-learn rejects it positionally.
    fbetas.append(fbeta_score(y_train,predicted,beta=2))
    

In [21]:
# Pick the cutoff with the highest F2 score. np.argmax returns a single index
# (the first one on ties), so my_cutoff is always one scalar threshold.
# The previous `cutoffs[fbetas==max(fbetas)]` compared a Python list with a
# scalar, which only produced a mask through NumPy's reflected comparison, and
# it returned several cutoffs on ties, breaking the later comparison against
# the test scores.
my_cutoff=cutoffs[np.argmax(fbetas)]

In [22]:
# Probability of class 1 for every test row, then thresholded at the chosen
# cutoff to obtain hard 0/1 predictions.
test_score=gs.predict_proba(cd_test)[:,1]
predictions=(test_score>my_cutoff).astype(int)

In [23]:
# How many test rows were assigned to each predicted class.
pd.Series(predictions).value_counts()

0    3012
1     988
dtype: int64

In [24]:
# Single-column frame holding the hard predictions under the response column's name.
submissions=pd.DataFrame(
    data={'V86':predictions},
)

In [26]:
# Write the predictions to the working directory without the index column.
submissions.to_csv(
    path_or_buf='sample_submission.csv',
    index=False,
)