In [1]:
import pandas as pd
import numpy as np

### Reading Test and Train Files

In [2]:
train_file =r'Data\Consumer_Complaints_train.csv'
test_file = r'Data\Consumer_Complaints_test_share.csv'

train = pd.read_csv(train_file)
test = pd.read_csv(test_file)
print(train.shape)
print(test.shape)

(478421, 18)
(119606, 17)


In [3]:
train.isnull().sum()

Date received                        0
Product                              0
Sub-product                     138473
Issue                                0
Sub-issue                       292625
Consumer complaint narrative    403327
Company public response         388029
Company                              0
State                             3839
ZIP code                          3848
Tags                            411215
Consumer consent provided?      342934
Submitted via                        0
Date sent to company                 0
Company response to consumer         0
Timely response?                     0
Consumer disputed?                   0
Complaint ID                         0
dtype: int64

In [4]:
train.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,2014-05-15,Credit card,,Billing statement,,,,Wells Fargo & Company,MI,48342,Older American,,Web,2014-05-16,Closed with explanation,Yes,No,856103
1,2014-09-18,Bank account or service,(CD) Certificate of deposit,"Making/receiving payments, sending money",,,,Santander Bank US,PA,18042,,,Referral,2014-09-24,Closed,Yes,No,1034666
2,2014-03-13,Credit reporting,,Incorrect information on credit report,Account status,,,Equifax,CA,92427,,,Referral,2014-04-03,Closed with non-monetary relief,Yes,No,756363
3,2015-07-17,Credit card,,Billing statement,,"My credit card statement from US Bank, XXXX. X...",Company chooses not to provide a public response,U.S. Bancorp,GA,305XX,Older American,Consent provided,Web,2015-07-17,Closed with monetary relief,Yes,No,1474177
4,2014-11-20,Credit card,,Transaction issue,,,,Bank of America,MA,02127,,,Web,2014-11-28,Closed with explanation,Yes,No,1132572


In [5]:
train['Consumer consent provided?'].unique()

array([nan, 'Consent provided', 'Other', 'Consent not provided',
       'Consent withdrawn'], dtype=object)

In [6]:
# dropping column consumer consent provided
train.drop(['Consumer consent provided?'],inplace=True,axis=1)
test.drop(['Consumer consent provided?'],inplace=True,axis=1)

In [7]:
# dropping column Tags
train.drop(['Tags'],inplace=True,axis=1)
test.drop(['Tags'],inplace=True,axis=1)

In [8]:
train.isnull().sum()

Date received                        0
Product                              0
Sub-product                     138473
Issue                                0
Sub-issue                       292625
Consumer complaint narrative    403327
Company public response         388029
Company                              0
State                             3839
ZIP code                          3848
Submitted via                        0
Date sent to company                 0
Company response to consumer         0
Timely response?                     0
Consumer disputed?                   0
Complaint ID                         0
dtype: int64

#### imputing sub-product column

In [9]:
unique_product = train['Product'].unique()
for i in unique_product:
    print(i)
    condition = (train['Product']==i) & (train['Sub-product'].isnull())
    cols = ['Sub-product']
    print(train.loc[condition,cols].shape)
    print(train[train['Product']==i].shape)
    print('-------------------------------')

Credit card
(57358, 1)
(57358, 16)
-------------------------------
Bank account or service
(0, 1)
(54403, 16)
-------------------------------
Credit reporting
(81115, 1)
(81115, 16)
-------------------------------
Mortgage
(0, 1)
(156175, 16)
-------------------------------
Debt collection
(0, 1)
(86544, 16)
-------------------------------
Student loan
(0, 1)
(14918, 16)
-------------------------------
Consumer Loan
(0, 1)
(18599, 16)
-------------------------------
Money transfers
(0, 1)
(3349, 16)
-------------------------------
Prepaid card
(0, 1)
(2226, 16)
-------------------------------
Payday loan
(0, 1)
(3219, 16)
-------------------------------
Other financial service
(0, 1)
(507, 16)
-------------------------------
Virtual currency
(0, 1)
(8, 16)
-------------------------------


only credit card and credit reporting has no sub product, so imputing sub-product of credit card as credit card

In [10]:
missing_sub_prd_cols = ['Credit reporting','Credit card']
for i in missing_sub_prd_cols:
    condition = (train['Product']==i) & (train['Sub-product'].isnull())
    cols = ['Sub-product'] 
    train.loc[condition,cols] = i
for i in missing_sub_prd_cols:
    condition = (test['Product']==i) & (test['Sub-product'].isnull())
    cols = ['Sub-product'] 
    test.loc[condition,cols] = i

In [11]:
train.isnull().sum()

Date received                        0
Product                              0
Sub-product                          0
Issue                                0
Sub-issue                       292625
Consumer complaint narrative    403327
Company public response         388029
Company                              0
State                             3839
ZIP code                          3848
Submitted via                        0
Date sent to company                 0
Company response to consumer         0
Timely response?                     0
Consumer disputed?                   0
Complaint ID                         0
dtype: int64

In [12]:
test.isnull().sum()

Date received                        0
Product                              0
Sub-product                          0
Issue                                0
Sub-issue                        73060
Consumer complaint narrative    101049
Company public response          96830
Company                              0
State                              925
ZIP code                           926
Submitted via                        1
Date sent to company                 0
Company response to consumer         0
Timely response?                     0
Complaint ID                         0
dtype: int64

#### Issue column

In [13]:
train['Issue'].unique()

array(['Billing statement', 'Making/receiving payments, sending money',
       'Incorrect information on credit report', 'Transaction issue',
       'Loan modification,collection,foreclosure',
       'Loan servicing, payments, escrow account',
       'Credit card protection / Debt protection',
       'Deposits and withdrawals',
       "Cont'd attempts collect debt not owed", 'Getting a loan', 'Other',
       'Account opening, closing, or management',
       "Credit reporting company's investigation",
       'Improper use of my credit report', 'Managing the loan or lease',
       'Communication tactics', 'False statements or representation',
       'Dealing with my lender or servicer',
       'Problems caused by my funds being low', 'Delinquent account',
       'Identity theft / Fraud / Embezzlement',
       'Other transaction issues', 'Disclosure verification of debt',
       'Balance transfer fee', 'Using a debit or ATM card',
       'Improper contact or sharing of info',
       'Appl

In [14]:
condition = (train['Issue'].isnull())
cols = ['Issue']
train.loc[condition,cols] = ''

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation
from nltk.tokenize import word_tokenize
lemma = WordNetLemmatizer()
my_stop = set(stopwords.words('english')+list(punctuation))

In [16]:
def split_into_lemmas(message):
    message = message.lower()
    words = word_tokenize(message)
    words_sans_stop = []
    for word in words:
        if word in my_stop:
            continue
        words_sans_stop.append(word)
    return[lemma.lemmatize(word) for word in words_sans_stop]

In [17]:
tfidf = TfidfVectorizer(analyzer = split_into_lemmas,
                        min_df=0.1,
                        max_df=0.8,
                        stop_words=my_stop)
cv = CountVectorizer()

In [18]:
x_issue = tfidf.fit(train['Issue'])
train_tfidf = x_issue.transform(train['Issue'])
test_tfidf = x_issue.transform(test['Issue'])

train_features1 = pd.DataFrame(train_tfidf.toarray())
test_features1 = pd.DataFrame(test_tfidf.toarray())

In [19]:
train_features1.shape

(478421, 13)

#### Sub-issue

In [20]:
help(TfidfVectorizer)

Help on class TfidfVectorizer in module sklearn.feature_extraction.text:

class TfidfVectorizer(CountVectorizer)
 |  TfidfVectorizer(input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, analyzer='word', stop_words=None, token_pattern='(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class 'numpy.float64'>, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
 |  
 |  Convert a collection of raw documents to a matrix of TF-IDF features.
 |  
 |  Equivalent to :class:`CountVectorizer` followed by
 |  :class:`TfidfTransformer`.
 |  
 |  Read more in the :ref:`User Guide <text_feature_extraction>`.
 |  
 |  Parameters
 |  ----------
 |  input : string {'filename', 'file', 'content'}
 |      If 'filename', the sequence passed as an argument to fit is
 |      expected to be a list of filenames that need reading to fetch
 |      the raw con

In [21]:
condition_train = (train['Sub-issue'].isnull())
condition_test= (test['Sub-issue'].isnull())
cols = ['Sub-issue']
train.loc[condition_train,cols] = ''
test.loc[condition_test,cols] = ''

In [22]:
train.isnull().sum()

Date received                        0
Product                              0
Sub-product                          0
Issue                                0
Sub-issue                            0
Consumer complaint narrative    403327
Company public response         388029
Company                              0
State                             3839
ZIP code                          3848
Submitted via                        0
Date sent to company                 0
Company response to consumer         0
Timely response?                     0
Consumer disputed?                   0
Complaint ID                         0
dtype: int64

In [23]:
x_sub_issue = tfidf.fit(train['Sub-issue'])
train_tfidf = x_sub_issue.transform(train['Sub-issue'])
test_tfidf = x_sub_issue.transform(test['Sub-issue'])

train_features2 = pd.DataFrame(train_tfidf.toarray())
test_features2 = pd.DataFrame(test_tfidf.toarray())

In [24]:
train_features2.shape

(478421, 1)

### Consumer complaint narrative

In [25]:
condition_train = (train['Consumer complaint narrative'].isnull())
condition_test= (test['Consumer complaint narrative'].isnull())
cols = ['Consumer complaint narrative']
train.loc[condition_train,cols] = ''
test.loc[condition_test,cols] = ''

In [26]:
x_complaint = tfidf.fit(train['Consumer complaint narrative'])
train_tfidf = x_complaint.transform(train['Consumer complaint narrative'])
test_tfidf = x_complaint.transform(test['Consumer complaint narrative'])

train_features3 = pd.DataFrame(train_tfidf.toarray())
test_features3 = pd.DataFrame(test_tfidf.toarray())

In [27]:
train_features3.shape

(478421, 1)

In [28]:
train=train.reset_index()
test=test.reset_index()

In [29]:
# Adding the new features to train and test tabels
final_train = pd.concat([train,train_features1,train_features2,train_features3],axis=1)
final_test = pd.concat([test,test_features1,test_features2,test_features3],axis=1)

In [30]:
final_train.shape

(478421, 32)

In [31]:
# dropping complaint id column from both test and train
final_test.drop(['Complaint ID'],inplace=True,axis=1)
final_train.drop(['Complaint ID'],inplace=True,axis=1)

In [32]:
# dropping Company public response column from both test and train
final_test.drop(['Company public response'],inplace=True,axis=1)
final_train.drop(['Company public response'],inplace=True,axis=1)

In [33]:
final_train.head()

Unnamed: 0,index,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company,State,ZIP code,...,5,6,7,8,9,10,11,12,0,0.1
0,0,2014-05-15,Credit card,Credit card,Billing statement,,,Wells Fargo & Company,MI,48342,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2014-09-18,Bank account or service,(CD) Certificate of deposit,"Making/receiving payments, sending money",,,Santander Bank US,PA,18042,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,2,2014-03-13,Credit reporting,Credit reporting,Incorrect information on credit report,Account status,,Equifax,CA,92427,...,0.0,0.520699,0.520737,0.0,0.0,0.0,0.511572,0.0,0.0,0.0
3,3,2015-07-17,Credit card,Credit card,Billing statement,,"My credit card statement from US Bank, XXXX. X...",U.S. Bancorp,GA,305XX,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,4,2014-11-20,Credit card,Credit card,Transaction issue,,,Bank of America,MA,02127,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
final_train.isnull().sum()

index                              0
Date received                      0
Product                            0
Sub-product                        0
Issue                              0
Sub-issue                          0
Consumer complaint narrative       0
Company                            0
State                           3839
ZIP code                        3848
Submitted via                      0
Date sent to company               0
Company response to consumer       0
Timely response?                   0
Consumer disputed?                 0
0                                  0
1                                  0
2                                  0
3                                  0
4                                  0
5                                  0
6                                  0
7                                  0
8                                  0
9                                  0
10                                 0
11                                 0
1

In [35]:
# dropping issue , Sub-issue and Consumer complaint narrative
cols = ['Issue','Sub-issue','Consumer complaint narrative']
final_train.drop(cols,inplace=True,axis=1)
final_test.drop(cols,inplace=True,axis=1)

In [36]:
final_train.shape

(478421, 27)

converting date comlumns to datetime type

In [37]:
date_cols = ['Date sent to company','Date received']
for i in date_cols:
    final_train[i] = pd.to_datetime(final_train[i])
    final_test[i] = pd.to_datetime(final_test[i])

In [38]:
final_train['days_diff'] = final_train['Date sent to company'] - final_train['Date received']

final_train['days_diff'] = final_train['days_diff'].astype(str)

In [39]:
final_train['days_diff'] = final_train['days_diff'].str.replace("days",'')

final_train['days_diff'] = final_train['days_diff'].str.replace(":00",'')
final_train['days_diff'] = final_train['days_diff'].str.replace("  ",'')

final_train['days_diff'] = final_train['days_diff'].str.replace("00.000000000",'')

final_train['days_diff'] = final_train['days_diff'].str.replace("+",'')
condition_final_train = (final_train['days_diff']=='NaT')
final_train.loc[condition_final_train,['days_diff']] = 0
final_train['days_diff'] = pd.to_numeric(final_train['days_diff'])

final_train = final_train[final_train['days_diff']>=0]

In [40]:
final_test['days_diff'] = final_test['Date sent to company'] - final_test['Date received']
final_test['days_diff'] = final_test['days_diff'].astype(str)
final_test['days_diff'] = final_test['days_diff'].str.replace("days",'')

final_test['days_diff'] = final_test['days_diff'].str.replace(":00",'')
final_test['days_diff'] = final_test['days_diff'].str.replace("  ",'')

final_test['days_diff'] = final_test['days_diff'].str.replace("00.000000000",'')

final_test['days_diff'] = final_test['days_diff'].str.replace("+",'')

condition_final_test = (final_test['days_diff']=='NaT')
final_test.loc[condition_final_test,['days_diff']] = 0
final_test['days_diff'] = pd.to_numeric(final_test['days_diff'])


condition_final_test = final_test['days_diff']<=0
final_test.loc[condition_final_test,['days_diff']]=0


In [41]:
final_train.isnull().sum()

index                              0
Date received                      0
Product                            0
Sub-product                        0
Company                            0
State                           3828
ZIP code                        3837
Submitted via                      0
Date sent to company               0
Company response to consumer       0
Timely response?                   0
Consumer disputed?                 0
0                                  0
1                                  0
2                                  0
3                                  0
4                                  0
5                                  0
6                                  0
7                                  0
8                                  0
9                                  0
10                                 0
11                                 0
12                                 0
0                                  0
0                                  0
d

In [42]:
final_test.isnull().sum()

index                             0
Date received                     0
Product                           0
Sub-product                       0
Company                           0
State                           925
ZIP code                        926
Submitted via                     1
Date sent to company              0
Company response to consumer      0
Timely response?                  0
0                                 0
1                                 0
2                                 0
3                                 0
4                                 0
5                                 0
6                                 0
7                                 0
8                                 0
9                                 0
10                                0
11                                0
12                                0
0                                 0
0                                 0
days_diff                         0
dtype: int64

In [43]:
# droppig columns Company and ZIP code since there are lot of unique values
for col in ['ZIP code','Company']:
    final_train.drop([col],1,inplace=True)
    final_test.drop([col],1,inplace=True)

In [44]:
# dropping the date cols and company public response
final_train.drop(['Date received','Date sent to company'],inplace=True,axis=1)
final_test.drop(['Date received','Date sent to company'],inplace=True,axis=1)

In [45]:
final_train.dtypes

index                             int64
Product                          object
Sub-product                      object
State                            object
Submitted via                    object
Company response to consumer     object
Timely response?                 object
Consumer disputed?               object
0                               float64
1                               float64
2                               float64
3                               float64
4                               float64
5                               float64
6                               float64
7                               float64
8                               float64
9                               float64
10                              float64
11                              float64
12                              float64
0                               float64
0                               float64
days_diff                         int64
dtype: object

In [46]:
final_test.dtypes

index                             int64
Product                          object
Sub-product                      object
State                            object
Submitted via                    object
Company response to consumer     object
Timely response?                 object
0                               float64
1                               float64
2                               float64
3                               float64
4                               float64
5                               float64
6                               float64
7                               float64
8                               float64
9                               float64
10                              float64
11                              float64
12                              float64
0                               float64
0                               float64
days_diff                         int64
dtype: object

In [47]:
final_train['Consumer disputed?'] = (final_train['Consumer disputed?']=='Yes').astype(int)

In [48]:
cols = final_train.select_dtypes(['object'])

In [49]:
for i in cols:
    print(i)
    print("-------------------")
    print(final_train[i].value_counts())
    print("=====================================================")

Product
-------------------
Mortgage                   154979
Debt collection             85157
Credit reporting            79787
Credit card                 56435
Bank account or service     54091
Consumer Loan               18487
Student loan                14737
Money transfers              3286
Payday loan                  3202
Prepaid card                 2226
Other financial service       507
Virtual currency                8
Name: Product, dtype: int64
Sub-product
-------------------
Credit reporting                          79787
Credit card                               74063
Other mortgage                            61399
Conventional fixed mortgage               47485
Checking account                          38080
Other (i.e. phone, health club, etc.)     25337
I do not know                             17884
Conventional adjustable mortgage (ARM)    17347
FHA mortgage                              16094
Non-federal student loan                  15069
Medical                 

In [50]:
for i in cols:
    print(i)
    print("-------------------")
    print(final_test[i].value_counts())

    print("=====================================================")

Product
-------------------
Mortgage                   38904
Debt collection            21503
Credit reporting           20497
Credit card                14255
Bank account or service    13691
Consumer Loan               4664
Student loan                3717
Payday loan                  829
Money transfers              827
Prepaid card                 587
Other financial service      128
Virtual currency               4
Name: Product, dtype: int64
Sub-product
-------------------
Credit reporting                          20497
Credit card                               18626
Other mortgage                            15241
Conventional fixed mortgage               12099
Checking account                           9721
Other (i.e. phone, health club, etc.)      6444
I do not know                              4473
Conventional adjustable mortgage (ARM)     4434
FHA mortgage                               3992
Non-federal student loan                   3834
Medical                             

In [51]:
pd.set_option('display.max_columns',60)
final_train.head()

Unnamed: 0,index,Product,Sub-product,State,Submitted via,Company response to consumer,Timely response?,Consumer disputed?,0,1,2,3,4,5,6,7,8,9,10,11,12,0.1,0.2,days_diff
0,0,Credit card,Credit card,MI,Web,Closed with explanation,Yes,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,1,Bank account or service,(CD) Certificate of deposit,PA,Referral,Closed,Yes,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,6
2,2,Credit reporting,Credit reporting,CA,Referral,Closed with non-monetary relief,Yes,0,0.0,0.0,0.442718,0.0,0.0,0.0,0.520699,0.520737,0.0,0.0,0.0,0.511572,0.0,0.0,0.0,21
3,3,Credit card,Credit card,GA,Web,Closed with monetary relief,Yes,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
4,4,Credit card,Credit card,MA,Web,Closed with explanation,Yes,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8


In [52]:
final_train['Timely response?'] = (final_train['Timely response?']=='Yes').astype(int)
final_test['Timely response?'] = (final_test['Timely response?']=='Yes').astype(int)

In [53]:
k=final_train['State'].value_counts()
for val in k.axes[0][0:15]:
    varname='State_'+val.replace(',','_').replace(' ','_')
    final_train[varname]=np.where(final_train['State']==val,1,0)
    final_test[varname]=np.where(final_test['State']==val,1,0)
del final_train['State']
del final_test['State']

In [54]:
for col in ['Sub-product']:
    varname=col.replace('-','_').replace('?','').replace(" ",'_')+'_isNan'
    final_train[varname]=np.where(pd.isnull(final_train[col]),1,0)
    final_train.drop([col],1,inplace=True)
    final_test[varname]=np.where(pd.isnull(final_test[col]),1,0)
    final_test.drop([col],1,inplace=True)

In [55]:
for col in ['Product','Submitted via','Company response to consumer']:
    
    temp=pd.get_dummies(final_train[col],prefix=col,drop_first=True)
    final_train=pd.concat([temp,final_train],1)
    final_train.drop([col],1,inplace=True)
    
    temp=pd.get_dummies(final_test[col],prefix=col,drop_first=True)
    final_test=pd.concat([temp,final_test],1)
    final_test.drop([col],1,inplace=True)

In [56]:
final_train.shape

(472902, 57)

In [57]:
final_test.shape

(119606, 56)

In [58]:
final_train['Consumer disputed?'].value_counts()

0    372676
1    100226
Name: Consumer disputed?, dtype: int64

# Splitting train into features and target

In [59]:
x=final_train.drop(['Consumer disputed?'],axis=1)
y=final_train['Consumer disputed?']

# PCA

In [60]:
from sklearn.decomposition import PCA

In [61]:
pca = PCA(n_components=25,random_state=2)
principalComponents = pca.fit_transform(x)

test = pca.transform(final_test)

# Over sampling with SMOTE

In [62]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [63]:
x_s, y_s = SMOTE(sampling_strategy="not majority", random_state=2,
                                k_neighbors=3).fit_resample(principalComponents, y)

In [64]:
y_s

array([0, 0, 0, ..., 1, 1, 1])

In [65]:
x_train,x_test,y_train,y_test = train_test_split(x_s,y_s,test_size = 0.4,random_state = 2)

### froming test and train from train

In [66]:
# from sklearn.model_selection import train_test_split

In [74]:
x_train_no_s,x_test_no_s,y_train_no_s,y_test_no_s = train_test_split(principalComponents,y,test_size = 0.3,random_state = 2)

In [68]:
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.metrics import roc_auc_score

In [69]:
help(LogisticRegression())

Help on LogisticRegression in module sklearn.linear_model.logistic object:

class LogisticRegression(sklearn.base.BaseEstimator, sklearn.linear_model.base.LinearClassifierMixin, sklearn.linear_model.base.SparseCoefMixin)
 |  LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='warn', max_iter=100, multi_class='warn', verbose=0, warm_start=False, n_jobs=None)
 |  
 |  Logistic Regression (aka logit, MaxEnt) classifier.
 |  
 |  In the multiclass case, the training algorithm uses the one-vs-rest (OvR)
 |  scheme if the 'multi_class' option is set to 'ovr', and uses the cross-
 |  entropy loss if the 'multi_class' option is set to 'multinomial'.
 |  (Currently the 'multinomial' option is supported only by the 'lbfgs',
 |  'sag' and 'newton-cg' solvers.)
 |  
 |  This class implements regularized logistic regression using the
 |  'liblinear' library, 'newton-cg', 'sag' and 'lbfgs' solvers. It 

In [70]:
l=LogisticRegression(fit_intercept=True,solver ='saga',n_jobs=-1,verbose=True,max_iter=2000)

In [71]:
l.fit(x_train,y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


max_iter reached after 793 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed: 13.2min finished


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=2000, multi_class='warn',
          n_jobs=-1, penalty='l2', random_state=None, solver='saga',
          tol=0.0001, verbose=True, warm_start=False)

In [72]:
l.score

<bound method ClassifierMixin.score of LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=2000, multi_class='warn',
          n_jobs=-1, penalty='l2', random_state=None, solver='saga',
          tol=0.0001, verbose=True, warm_start=False)>

In [75]:
pred_test = l.predict(x_test_no_s)

In [76]:
pred_test

array([0, 1, 1, ..., 1, 0, 1])

In [78]:
roc_auc_score(pred_test,y_test_no_s)

0.5183119013811687

# logistic using grid search CV

In [79]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import Ridge,Lasso
from sklearn.model_selection import GridSearchCV

In [94]:
params={'penalty':['l1','l2'],
        'C':np.linspace(0.1,0.3,10)}

In [95]:
model=LogisticRegression(fit_intercept=True)#,class_weight={0:0.2,1:0.8})

In [96]:
grid_search=GridSearchCV(model,param_grid=params,cv=5,scoring="roc_auc",n_jobs=4,verbose = True)

In [97]:
grid_search.fit(x_train,y_train,)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.0min
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:  2.6min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=4,
       param_grid={'penalty': ['l1', 'l2'], 'C': array([0.1    , 0.12222, 0.14444, 0.16667, 0.18889, 0.21111, 0.23333,
       0.25556, 0.27778, 0.3    ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=True)

In [104]:
grid_search.best_params_

{'C': 0.18888888888888888, 'penalty': 'l1'}

In [105]:
y_pred = grid_search.predict(x_test_no_s)

In [106]:
roc_auc_score(y_pred,y_test_no_s)

0.5503519289250806

In [113]:
prediction=np.where(grid_search.predict(test)==1,"Yes","No")

pd.DataFrame(prediction).head(4)

pd.DataFrame(prediction).to_csv('sample_submission.csv',index=False)