In [5]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, GridSearchCV

import datetime

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
%matplotlib inline  

In [6]:
# Load the contest data from CSV files addressed relative to the notebook's
# working directory.
# `train` is later passed to work() as `df`, which reads its 'TARGET' column.
train = pd.read_csv('contest_train.csv')
# Raw test frame; note that the preprocessing cell further down rebinds the
# name `X_test` to the processed version returned by work().
X_test = pd.read_csv('contest_test.csv')

In [10]:
def work(df, X_test):
    """Turn the raw train/test frames into model-ready inputs.

    Steps:
      1. Drop a fixed list of feature columns from both frames.
      2. Classify the remaining columns *once*, using statistics of the raw
         training frame: "binary" (max == 1), "multiclass" (std < 1 and not
         binary) and "numeric" (everything else). 'TARGET' and 'ID' are never
         classified.
      3. One-hot encode binary and multiclass columns in BOTH frames, using
         the levels observed in the training frame so the indicator columns
         are identical.
      4. Drop training rows that still contain missing values.
      5. Standardise the numeric columns with a scaler fitted on the training
         rows only; missing numeric values in the test frame are then set to
         0, i.e. the training mean after standardisation.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data; must contain a 'TARGET' column.
    X_test : pandas.DataFrame
        Test data with the same raw feature columns as ``df``.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test)`` where ``X_train`` and ``y_train`` are
        NumPy arrays and ``X_test`` is a DataFrame whose columns match the
        columns of ``X_train`` in name and order. The input frames are not
        modified.
    """
    dropped = ['FEATURE_3', 'FEATURE_144', 'FEATURE_249', 'FEATURE_256']
    service = ('TARGET', 'ID')

    train = df.drop(columns=dropped)
    test = X_test.drop(columns=dropped)

    # Classify columns before any encoding. Doing it afterwards (as this
    # function used to) makes the freshly created indicator columns look like
    # low-variance or numeric features, so they get re-encoded or scaled.
    stats = train.describe()
    candidates = [col for col in stats.columns if col not in service]
    binary = [col for col in candidates if stats.loc['max', col] == 1]
    multiclass = [col for col in candidates
                  if col not in binary and stats.loc['std', col] < 1]
    categorical = binary + multiclass
    excluded = set(categorical) | set(service)
    numeric = [col for col in train.columns if col not in excluded]

    if categorical:
        # Pin the category levels to those seen in training so that
        # get_dummies emits exactly the same indicator columns for the test
        # frame (levels unseen in training become all-zero rows).
        for col in categorical:
            levels = sorted(train[col].dropna().unique())
            train[col] = pd.Categorical(train[col], categories=levels)
            test[col] = pd.Categorical(test[col], categories=levels)
        train = pd.get_dummies(train, columns=categorical, dtype=float)
        test = pd.get_dummies(test, columns=categorical, dtype=float)

    # Missing categorical values were absorbed as all-zero indicators above,
    # so this only removes rows with gaps in the remaining columns.
    train = train.dropna()

    # NOTE(review): 'ID' is excluded from encoding and scaling but is still
    # part of both feature sets, as before - confirm that is intended.
    feature_cols = [col for col in train.columns if col != 'TARGET']
    # Safety net: enforce identical column set and order on the test frame.
    test = test.reindex(columns=feature_cols, fill_value=0)

    if numeric:
        # Fit once, on training rows only, then apply to both frames.
        scaler = StandardScaler().fit(train[numeric].to_numpy())
        train[numeric] = scaler.transform(train[numeric].to_numpy())
        test[numeric] = scaler.transform(test[numeric].to_numpy())
        # Test rows cannot be dropped (each needs a prediction), so impute
        # missing numeric values with the training mean (0 after scaling).
        test[numeric] = test[numeric].fillna(0.0)

    return train[feature_cols].to_numpy(), train['TARGET'].to_numpy(), test

In [11]:
def Testing_grid_reg(X_train, Y_train, c=[10.0 ** i for i in range(-8, 0)]):
    """Grid-search a class-balanced logistic regression.

    The search covers the regularisation strengths in ``c``, both 'l2' and
    'l1' penalties and two ``max_iter`` values, scored with 'f1_micro' over a
    shuffled 5-fold stratified split (fixed seed). The elapsed time and the
    best mean cross-validated score are printed.

    Parameters
    ----------
    X_train : array-like
        Feature matrix.
    Y_train : array-like
        Target labels.
    c : sequence of float, optional
        Candidate values for the inverse regularisation strength ``C``.

    Returns
    -------
    dict
        ``best_params_`` of the fitted ``GridSearchCV``.
    """
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # The grid contains an L1 penalty, so the solver has to be one that
    # supports it; relying on the library default breaks once that default
    # is a solver without L1 support. Parallelism is left to GridSearchCV.
    clf = LogisticRegression(class_weight='balanced', solver='liblinear')

    # list(c) copies the (shared) default argument instead of handing the
    # same list object to every call.
    grid = {
        'C': list(c),
        'penalty': ['l2', 'l1'],
        'max_iter': [50, 80],
    }

    start_time = datetime.datetime.now()
    gs = GridSearchCV(clf, scoring='f1_micro',
                      param_grid=grid, cv=cv,
                      return_train_score=True, n_jobs=-1, verbose=True)
    gs.fit(X_train, Y_train)

    print('Time elapsed:', datetime.datetime.now() - start_time)
    # best_score_ is the mean test score of the top-ranked candidate; unlike
    # max() over the raw score array it is not thrown off by NaN scores of
    # candidates that failed to fit.
    print(gs.best_score_)

    return gs.best_params_

In [None]:
# Run the preprocessing. This rebinds `X_test` to the processed frame, and
# work() drops columns from its inputs, so re-run the data-loading cell
# before executing this cell a second time.
X_train, y_train, X_test = work(df=train, X_test=X_test)
# Optional hyper-parameter search (disabled):
#     Testing_grid_reg(X_train, y_train)

In [None]:
# Final model with fixed hyper-parameters.
# The solver is set explicitly because an L1 penalty needs a solver that
# supports it; relying on the library default fails once that default is a
# solver without L1 support. n_jobs is omitted since liblinear ignores it.
clf = LogisticRegression(class_weight='balanced', solver='liblinear',
                         C=10 ** -6, penalty='l1', max_iter=50)

In [None]:
clf.fit(X_train, y_train)

# The model was fitted on a plain array, so predict on one as well.
test_matrix = X_test.to_numpy()

# Index the result frames like X_test so the column-wise concat below lines
# up row by row.
predict = pd.DataFrame({'TARGET': clf.predict(test_matrix)}, index=X_test.index)

# predict_proba returns one column per class, ordered like clf.classes_.
class_proba = clf.predict_proba(test_matrix)
if class_proba.shape[1] == 2:
    # Binary case: keep a single 'PROB' column with the probability of the
    # second class in clf.classes_.
    proba = pd.DataFrame({'PROB': class_proba[:, 1]}, index=X_test.index)
else:
    proba = pd.DataFrame(class_proba, index=X_test.index,
                         columns=['PROB_{}'.format(label) for label in clf.classes_])

# axis=1 appends the predictions as new columns; the default (axis=0) would
# stack the three frames underneath each other.
pd.concat([X_test, predict, proba], axis=1).to_csv('res.csv')