# Load and observe data

Firstly, import necessary modules

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

Read data and concat them into one df

In [2]:
df_train = pd.read_csv(
    './data/train.csv',
    #dtype={'Field_34': float}
)
df_test = pd.read_csv('./data/test.csv')

df = pd.concat([df_train, df_test])

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


Get some information about data

In [32]:
df_train.describe()

Unnamed: 0,id,label,Field_3,Field_10,Field_13,Field_14,Field_16,Field_17,Field_19,Field_20,...,partner5_G,partner5_H,partner5_K,partner5_L,num_of_phone,Field_78,Field_79,Field_80,Field_81,Field_82
count,53030.0,53030.0,25564.0,25564.0,25564.0,25564.0,25564.0,25564.0,25564.0,25564.0,...,48283.0,48283.0,48283.0,48283.0,50518.0,36397.0,27782.0,36320.0,35981.0,53030.0
mean,26514.5,0.316726,1.538687,1.096542,1.0,1.0,1.0,1.0,0.267022,3603181.0,...,4.1e-05,0.0,0.0,0.0,1.087236,20.80144,30.911307,-1.793163,-32.149412,1.668659
std,15308.586724,0.465204,0.498511,0.332593,0.0,0.0,0.0,0.0,0.835968,2282034.0,...,0.006436,0.0,0.0,0.0,0.297815,40.10993,35.968215,32.176041,63.442452,1.09091
min,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,-267.0,0.0,-267.0,-290.0,1.0
25%,13257.25,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1490000.0,...,0.0,0.0,0.0,0.0,1.0,4.0,5.326888,-5.6,-69.0,1.0
50%,26514.5,0.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,3801000.0,...,0.0,0.0,0.0,0.0,1.0,16.0,15.556349,2.086957,0.0,1.0
75%,39771.75,1.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,4500000.0,...,0.0,0.0,0.0,0.0,1.0,36.0,46.013087,9.5,1.0,2.0
max,53029.0,1.0,2.0,5.0,1.0,1.0,1.0,1.0,6.864,29800000.0,...,1.0,0.0,0.0,0.0,5.0,242.0,325.976226,238.0,238.0,4.0


In [33]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53030 entries, 0 to 53029
Columns: 195 entries, id to Field_82
dtypes: float64(133), int64(3), object(59)
memory usage: 78.9+ MB


In [34]:
df_train['label'].value_counts()

0    36234
1    16796
Name: label, dtype: int64

We can see that data is imbalance

# Pre-process data

In [3]:
string_fields = [
    'Field_12',
    'Field_18',
    'Field_46',
    'Field_47',
    'Field_48',
    'Field_49',
    'Field_61',
    'Field_62',
    'Field_65',
    'Field_66',
    'maCv',
    'data.basic_info.locale',
    'currentLocationCity',
    'currentLocationCountry',
    'currentLocationName',
    'currentLocationState',
    'currentLocationState',
    'homeTownCity',
    'homeTownCountry',
    'homeTownName',
    'homeTownState',
    'brief'
]

In [4]:
def getNullCounts(df_temp):
    null_counts = pd.DataFrame(df_temp.isnull().sum(axis = 0))
    null_counts.reset_index(inplace = True)
    null_counts.columns = ['Field', 'Number of null']
    null_counts['% null'] = null_counts['Number of null'] / len(list(df_temp.index))
    null_counts.sort_values(
        ['Number of null'], 
        ascending = False,
        axis = 0,
        inplace = True
    )
    return null_counts

def getBadFields(df_temp):
    null_counts = getNullCounts(df_temp)
    bad_fields = null_counts['Field'][null_counts['% null'] > 0.7]
    return list(bad_fields)

def dropBadFields(df_temp):
    bad_fields = getBadFields(df_temp)
    # Drop all except fields in string_fields
    bad_fields = [item for item in bad_fields if item not in string_fields]
    return df_temp.drop(
        bad_fields,
        axis = 1,
    )
    

In [5]:
def dropObjectFields(df_temp):
    object_fields = df_temp.loc[:, list(df_temp.dtypes == object)].columns
    object_fields = list(object_fields)
    # Drop all except fields in string_fields
    object_fields = [item for item in object_fields if item not in string_fields]
    return df_temp.drop(
        object_fields,
        axis = 1,
    )
    

In [6]:
def cleanFields(df_temp):
    df_temp['Field_18'] = df_temp['Field_18'].replace(['.', ','], np.nan)
    df_temp['brief'] = df_temp['brief'].replace(['notfound'], np.nan)
    return df_temp

In [7]:
def preProcess(df_temp):
    df_temp = dropBadFields(df_temp)
    df_temp = dropObjectFields(df_temp)
    df_temp = cleanFields(df_temp)
    df_temp = df_temp.drop(['id'], axis = 1)
    df_temp = df_temp.drop(['ngaySinh', 'namSinh'], axis = 1)
    
    #df_temp = df_temp.fillna(0)
    df_temp = pd.get_dummies(df_temp, dummy_na = True)
    
    return df_temp

In [10]:
preProcess(df)

Unnamed: 0,label,Field_3,Field_10,Field_12,Field_13,Field_14,Field_16,Field_17,Field_18,Field_19,...,partner5_H,partner5_K,partner5_L,brief,num_of_phone,Field_78,Field_79,Field_80,Field_81,Field_82
0,1.0,1.0,1.0,G8,1.0,1.0,1.0,1.0,Trung tâm Kinh doanh tiền mặt,0.000,...,0.0,0.0,0.0,cb1,1.0,0.0,0.000000,0.000000,0.0,1
1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.000,...,0.0,0.0,0.0,4,1.0,0.0,0.000000,0.000000,0.0,1
2,0.0,2.0,1.0,0,1.0,1.0,1.0,1.0,0,0.000,...,0.0,0.0,0.0,1,1.0,33.0,10.769445,6.466667,0.0,2
3,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.000,...,0.0,0.0,0.0,1,1.0,33.0,10.769445,6.466667,0.0,3
4,1.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.000,...,0.0,0.0,0.0,1,1.0,33.0,10.769445,6.466667,0.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20376,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.000,...,0.0,0.0,0.0,1,1.0,24.0,42.644640,-10.722222,-160.0,3
20377,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.000,...,0.0,0.0,0.0,2,1.0,44.0,13.740300,6.562500,0.0,3
20378,0.0,1.0,1.0,0,1.0,1.0,1.0,1.0,0,4.136,...,0.0,0.0,0.0,0,1.0,0.0,0.000000,0.000000,0.0,1
20379,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.000,...,0.0,0.0,0.0,0,1.0,0.0,0.000000,0.000000,0.0,1


# Create and train model

In [49]:
#skf = StratifiedKFold(n_splits = 5, random_state=3462873, shuffle=True)
#df_train = df_train.sample(frac = 1, random_state = 1).reset_index(drop = True)
df_train = df[df['id'] < 53030]
df_test = df[df['id'] >= 53030]

In [51]:
X_train = df_train.drop(columns=['label'])
Y_train = df_train['label']
print(X_train.shape)
print(Y_train.shape)

(53030, 134)
(53030,)


In [52]:
lr = LogisticRegression()
lr.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [53]:
X_test = preProcess(df_test)
print(X_test.shape)

(20381, 134)


In [55]:
Y_test = lr.predict_proba(X_test)[:, 1]
Y_test

array([0.5       , 0.5       , 0.5       , ..., 0.25379198, 0.5       ,
       0.24730212])

In [56]:
res_df = pd.DataFrame({'id': df_test['id'], 'label': Y_test})
res_df.to_csv('submission.csv', index=False)