In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge
from sklearn.preprocessing import Normalizer
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

In [20]:
train_df = pd.read_csv('train_smoker.csv')
test_df = pd.read_csv('test_smoker.csv')

In [21]:
print(train_df.shape)
print(test_df.shape)

(159256, 24)
(106171, 23)


In [22]:
pd.set_option('display.max_columns', None)
train_df.head()

Unnamed: 0,id,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,fasting blood sugar,Cholesterol,triglyceride,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,0,55,165,60,81.0,0.5,0.6,1,1,135,87,94,172,300,40,75,16.5,1,1.0,22,25,27,0,1
1,1,70,165,65,89.0,0.6,0.7,2,2,146,83,147,194,55,57,126,16.2,1,1.1,27,23,37,1,0
2,2,20,170,75,81.0,0.4,0.5,1,1,118,75,79,178,197,45,93,17.4,1,0.8,27,31,53,0,1
3,3,35,180,95,105.0,1.5,1.2,1,1,131,88,91,180,203,38,102,15.9,1,1.0,20,27,30,1,0
4,4,30,165,60,80.5,1.5,1.0,1,1,121,76,91,155,87,44,93,15.4,1,0.8,19,13,17,0,1


In [23]:
features = train_df.drop(['id','smoking'],axis=1)
y = train_df['smoking']

In [33]:
test_df_features = test_df.drop(['id'],axis=1)

In [25]:
x_train, x_test, y_train, y_test = train_test_split(features,y,test_size=0.2,random_state=7)

In [8]:
X = StandardScaler().fit_transform(x_train)
X_test = StandardScaler().fit_transform(x_test)

In [33]:
paramsRidge = {'alpha': [0.01,0.1,0.2,0.5,0.8,1],
               'solver':['auto','svd','cholesky','lsqr','sag','saga']}
ridge = Ridge()
ridge_cv = GridSearchCV(ridge,paramsRidge,cv=4)
ridge_cv.fit(X,y_train)

In [34]:
print('Best params are: ', ridge_cv.best_params_)
print('Best accuracy is: ', ridge_cv.best_score_)

Best params are:  {'alpha': 0.01, 'solver': 'svd'}
Best accuracy is:  0.2989465529001938


In [26]:
paramsXGB = {
             'learning_rate':[0.1],
             
             'booster':['gbtree','dart','gblinear']}

xgb = XGBClassifier()
xgb_cv = GridSearchCV(xgb,paramsXGB,cv=3)
xgb_cv.fit(X,y_train)

In [27]:
y_hat = xgb_cv.predict(X_test)

In [28]:
score = roc_auc_score(y_test,y_hat)
score

0.776404439769957

In [34]:
prediction = xgb_cv.predict_proba(test_df_features)[:,1]

In [35]:
output = pd.DataFrame({'id':test_df['id'],'smoking':prediction})

In [36]:
output.head()

Unnamed: 0,id,smoking
0,159256,0.424173
1,159257,0.424173
2,159258,0.3749
3,159259,0.43333
4,159260,0.424173


In [37]:
output.to_csv('submission.csv',index=False)