## import, read and introduction

In [3]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import random

### Read train and test data files

In [4]:
df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [5]:
df.shape

(297606, 59)

## Preprocessing

### Check for missing values

In [6]:
df.isnull().sum()

id                       0
CARTYPE_07_CAT        5727
METRIC_18_BIN            0
METRIC_04                0
CONDITION_11_BIN         0
REGISTRATION_03      53709
CONDITION_05_CAT      2891
CARTYPE_15               0
METRIC_09                0
METRIC_01                0
REGISTRATION_01          0
CONDITION_14             0
METRIC_12                0
METRIC_08                0
METRIC_06                0
CONDITION_02_CAT       104
METRIC_10                0
CARTYPE_10_CAT           0
CONDITION_09_BIN         0
CARTYPE_03_CAT      205287
target                   0
CARTYPE_09_CAT         282
CARTYPE_11               2
CARTYPE_01_CAT          57
CONDITION_13_BIN         0
METRIC_11                0
METRIC_14                0
METRIC_17_BIN            0
CARTYPE_13               0
CARTYPE_06_CAT           0
METRIC_15_BIN            0
CONDITION_03             0
METRIC_20_BIN            0
METRIC_07                0
CARTYPE_02_CAT           2
CONDITION_01             0
CONDITION_18_BIN         0
C

### Droping columns with more than 20000 null values

In [7]:
df.drop(['REGISTRATION_03','CARTYPE_05_CAT','CARTYPE_14','CARTYPE_03_CAT'],axis=1,inplace=True)

In [8]:
df.isnull().sum()

id                     0
CARTYPE_07_CAT      5727
METRIC_18_BIN          0
METRIC_04              0
CONDITION_11_BIN       0
CONDITION_05_CAT    2891
CARTYPE_15             0
METRIC_09              0
METRIC_01              0
REGISTRATION_01        0
CONDITION_14           0
METRIC_12              0
METRIC_08              0
METRIC_06              0
CONDITION_02_CAT     104
METRIC_10              0
CARTYPE_10_CAT         0
CONDITION_09_BIN       0
target                 0
CARTYPE_09_CAT       282
CARTYPE_11             2
CARTYPE_01_CAT        57
CONDITION_13_BIN       0
METRIC_11              0
METRIC_14              0
METRIC_17_BIN          0
CARTYPE_13             0
CARTYPE_06_CAT         0
METRIC_15_BIN          0
CONDITION_03           0
METRIC_20_BIN          0
METRIC_07              0
CARTYPE_02_CAT         2
CONDITION_01           0
CONDITION_18_BIN       0
CONDITION_16_BIN       0
CONDITION_07_BIN       0
CONDITION_17_BIN       0
CARTYPE_11_CAT         0
CONDITION_10_BIN       0


### Dropping the unique identification column

In [9]:
df = df.drop(['id'],axis=1) # removing id from the data of computation as it is not related to prediction

### Dropping rows with na values

In [10]:
# df = df.dropna()
df=df.fillna(method='ffill')

In [11]:
for i in list(df.isnull().sum()):
 if(i != 0):
   print("problem")

### Check for question marks

In [12]:
question_marks = list((df == "?").sum())
for i in question_marks:
  if i !=0 :
    print("problem")

### Check for duplicated rows

In [13]:
df.duplicated().sum()

0

There are no duplicate rows.

### Categorical features

In [14]:
df.nunique()


CARTYPE_07_CAT          2
METRIC_18_BIN           2
METRIC_04               6
CONDITION_11_BIN        2
CONDITION_05_CAT        7
CARTYPE_15             15
METRIC_09               8
METRIC_01              10
REGISTRATION_01        10
CONDITION_14            5
METRIC_12              11
METRIC_08              11
METRIC_06              10
CONDITION_02_CAT        4
METRIC_10              26
CARTYPE_10_CAT          3
CONDITION_09_BIN        2
target                  2
CARTYPE_09_CAT          5
CARTYPE_11              4
CARTYPE_01_CAT         12
CONDITION_13_BIN        2
METRIC_11              19
METRIC_14              23
METRIC_17_BIN           2
CARTYPE_13          52532
CARTYPE_06_CAT         18
METRIC_15_BIN           2
CONDITION_03           12
METRIC_20_BIN           2
METRIC_07              10
CARTYPE_02_CAT          2
CONDITION_01            8
CONDITION_18_BIN        2
CONDITION_16_BIN        2
CONDITION_07_BIN        2
CONDITION_17_BIN        2
CARTYPE_11_CAT        104
CONDITION_10

### Encoding

In [15]:
# new_column = pd.cut(df.CARTYPE_13,bins=[0,4.3921793760000005,5.0899999192,5.5847490088,5.9634146304,6.3249734592,6.6978225152,7.0661572663999985,7.4562481424,7.8857293432,8.423108709600001,9.181728172,10.4332589616,100],labels=[0,1,2,3,4,5,6,7,8,9,10,11,12])
# new_column.rename({'CARTYPE_13': 'CARTYPE_13_modified'}, inplace=True)
# df = pd.concat([df,new_column], axis=1, join='inner')
# df.head()

In [16]:
df.drop(['METRIC_06','METRIC_11','METRIC_14','METRIC_10','CARTYPE_12'],axis=1,inplace=True)

# sus = ['CONDITION_12_BIN','CARTYPE_12','CONDITION_10_BIN','CARTYPE_11_CAT','CONDITION_13_BIN','CARTYPE_10_CAT','CONDITION_14','CONDITION_11_BIN']
# df.drop(sus,axis=1,inplace=True)

In [17]:
# column_names_to_one_hot = list(df.columns).remove('target')
# column_names_to_one_hot = column_names_to_one_hot.remove('CARTYPE_13')
# print(column_names_to_one_hot)

for i in df.columns:
    if i =="CARTYPE_13":
        continue
    elif i =="target":
        continue
    else:
        df[i] = df[i].astype('object')

ctype13=df["CARTYPE_13"]
final_X_test = pd.get_dummies(data=df.drop(['CARTYPE_13'],axis=1))
final_X_test["CARTYPE_13"] = ctype13

# df = pd.get_dummies(df, columns=column_names_to_one_hot)
c = final_X_test.columns

## Model generation

### train_test_split_data

In [18]:
# test_X = df.drop(axis="columns",labels=["target"]).to_numpy().astype(np.float64)
# test_y = df['target'].to_numpy().astype(np.float64)

df = final_X_test

test_X = df.drop(axis="columns",labels=["target"]).to_numpy().astype(np.float64)
test_y = df['target'].to_numpy().astype(np.float64)

# test_X = final_X_test.to_numpy().astype(np.float64)
# test_y = y.to_numpy().astype(np.float64)

df['target'].value_counts()
idxes = df[df['target']==0].index
idxes = idxes[:245000]
df = df.drop(idxes)
df['target'].value_counts()

X = df.drop(axis="columns",labels=["target"]).to_numpy().astype(np.float64)
Y = df['target'].to_numpy().astype(np.float64)

shuffler = np.random.permutation(len(X))
train_X = X[shuffler]
train_y = Y[shuffler]

### Logistic regression

In [19]:
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

lreg = LogisticRegression(solver='liblinear')
lreg.fit(train_X,train_y)

LogisticRegression(solver='liblinear')

### testing

In [20]:
train_yhat = lreg.predict_proba(train_X)
test_yhat = lreg.predict_proba(test_X)

print("training score = ", roc_auc_score(train_y, train_yhat[:,1]))
print("test score = ", roc_auc_score(test_y,test_yhat[:,1]))


training score =  0.6479353724121165
test score =  0.6454385513372383


In [21]:
test_df=test_df.fillna(method='ffill')
test_df.drop(['REGISTRATION_03','CARTYPE_05_CAT','CARTYPE_14','CARTYPE_03_CAT'],axis=1,inplace=True)
# sus = ['CONDITION_12_BIN','CARTYPE_12','CONDITION_10_BIN','CARTYPE_11_CAT','CONDITION_13_BIN','CARTYPE_10_CAT','CONDITION_14','CONDITION_11_BIN']
# test_df.drop(sus,axis=1,inplace=True)

In [22]:
# new_column = pd.cut(test_df.CARTYPE_13,bins=[0,4.3921793760000005,5.0899999192,5.5847490088,5.9634146304,6.3249734592,6.6978225152,7.0661572663999985,7.4562481424,7.8857293432,8.423108709600001,9.181728172,10.4332589616,100],labels=[0,1,2,3,4,5,6,7,8,9,10,11,12])
# new_column.rename({'CARTYPE_13': 'CARTYPE_13_modified'}, inplace=True)
# test_df = pd.concat([test_df,new_column], axis=1, join='inner')
# test_df.head()

In [23]:
test_df.drop(['METRIC_06','METRIC_11','METRIC_14','METRIC_10','CARTYPE_12'],axis=1,inplace=True)

In [24]:
l = list(test_df.columns.values).remove("id")

In [25]:
for i in test_df.columns:
    if i =="CARTYPE_13":
        continue
    elif i =="id":
        continue
    else:
        test_df[i] = test_df[i].astype('object')

ctype13=test_df["CARTYPE_13"]
ID_valid=test_df["id"]
final_X_test = pd.get_dummies(data=test_df.drop(['CARTYPE_13','id'],axis=1))
final_X_test["CARTYPE_13"] = ctype13
final_X_test["id"] = ID_valid


In [26]:
test_df.isnull().sum()

id                  0
CARTYPE_07_CAT      0
METRIC_18_BIN       0
METRIC_04           0
CONDITION_11_BIN    0
CONDITION_05_CAT    0
CARTYPE_15          0
METRIC_09           0
METRIC_01           0
REGISTRATION_01     0
CONDITION_14        0
METRIC_12           0
METRIC_08           0
CONDITION_02_CAT    0
CARTYPE_10_CAT      0
CONDITION_09_BIN    0
CARTYPE_09_CAT      0
CARTYPE_11          0
CARTYPE_01_CAT      0
CONDITION_13_BIN    0
METRIC_17_BIN       0
CARTYPE_13          0
CARTYPE_06_CAT      0
METRIC_15_BIN       0
CONDITION_03        0
METRIC_20_BIN       0
METRIC_07           0
CARTYPE_02_CAT      0
CONDITION_01        0
CONDITION_18_BIN    0
CONDITION_16_BIN    0
CONDITION_07_BIN    0
CONDITION_17_BIN    0
CARTYPE_11_CAT      0
CONDITION_10_BIN    0
REGISTRATION_02     0
METRIC_16_BIN       0
CONDITION_15        0
METRIC_03           0
METRIC_05           0
METRIC_19_BIN       0
CARTYPE_08_CAT      0
CONDITION_12_BIN    0
CONDITION_06_BIN    0
METRIC_02           0
CARTYPE_04

In [27]:
test_X = final_X_test.drop(axis="columns", labels=['id']).to_numpy().astype(np.float64)
c1 = final_X_test.columns

for i in c:
    if i in c1:
        continue
    else:
        print(i)

target


In [28]:
test_yhat = lreg.predict_proba(test_X)

In [29]:
y_pred_df = pd.DataFrame(data=test_yhat[:,1], columns = ["target"])

submission_df = pd.concat([test_df["id"], y_pred_df["target"]], axis=1, join='inner')
submission_df.to_csv("submission.csv", index = False)
print(submission_df.shape)

(297606, 2)
