In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [18]:
# Single seed reused by the randomised train/validation split so it is repeatable.
random_state = 1234

# 1. EDA

## 1.1 Load Data

In [2]:
# Read both raw CSV files; the paths are relative to the working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../../input/train.csv", "../../input/test.csv")
)

## 1.2 Data Shape

In [3]:
# Report (rows, columns) of the raw training frame, then preview its first rows.
train_dims = train.shape
print("train set shape : ", train_dims)
train.head(n=5)

train set shape :  (4209, 378)


Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Report (rows, columns) of the raw test frame, then preview its first rows.
test_dims = test.shape
print("test set shape : ", test_dims)
test.head(n=5)

test set shape :  (4209, 377)


Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0


## 1.3 Extract Target Feature

In [5]:
# Pull the target column out of the training frame: `pop` returns the column
# and removes it from `train` in the same step.
train_y = train.pop('y')

## 1.4 Extract ID Feature

In [6]:
# Keep each file's ID column; these are used later to tell train rows from
# test rows once the two frames have been merged.
train_ID, test_ID = (frame["ID"] for frame in (train, test))

## 1.5 Merge train and test

In [7]:
# Stack train on top of test row-wise; the original row labels of each frame
# are kept, so index values repeat across the two halves.
data = pd.concat(objs=[train, test], axis="index")
data.head(n=5)

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,k,v,at,a,d,u,j,o,0,...,0,0,1,0,0,0,0,0,0,0
1,6,k,t,av,e,d,y,l,o,0,...,1,0,0,0,0,0,0,0,0,0
2,7,az,w,n,c,d,x,j,x,0,...,0,0,0,0,0,0,1,0,0,0
3,9,az,t,n,f,d,x,l,e,0,...,0,0,0,0,0,0,0,0,0,0
4,13,az,v,n,f,d,h,d,n,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Order the merged rows by ascending ID, which interleaves train and test rows.
data = data.sort_values(by="ID")
data.head(n=5)

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,k,v,at,a,d,u,j,o,0,...,0,0,1,0,0,0,0,0,0,0
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0


## 1.6 Extract merged data's ID

In [9]:
# Set the ID column aside and remove it from the feature frame in one step.
data_ID = data.pop("ID")
data.head(n=5)

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X10,X11,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,k,v,at,a,d,u,j,o,0,0,...,0,0,1,0,0,0,0,0,0,0
0,az,v,n,f,d,t,a,w,0,0,...,0,0,0,1,0,0,0,0,0,0
1,t,b,ai,a,d,b,g,y,0,0,...,0,0,1,0,0,0,0,0,0,0
2,az,v,as,f,d,a,j,j,0,0,...,0,0,0,1,0,0,0,0,0,0
3,az,l,n,f,d,z,l,n,0,0,...,0,0,0,1,0,0,0,0,0,0


## 1.7 Split data into categorical and numerical features

In [10]:
# Count how many columns there are of each dtype.
data.dtypes.value_counts()

int64     368
object      8
dtype: int64

In [11]:
# Columns with dtype `object` are treated as the categorical features.
data_cat = data.select_dtypes(include="object")
data_cat_features = data_cat.columns
data_cat.head(n=5)

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8
0,k,v,at,a,d,u,j,o
0,az,v,n,f,d,t,a,w
1,t,b,ai,a,d,b,g,y
2,az,v,as,f,d,a,j,j
3,az,l,n,f,d,z,l,n


In [12]:
# Columns with dtype `int64` are treated as the numerical features.
data_num = data.select_dtypes(include="int64")
data_num_features = data_num.columns
data_num.head(n=5)

Unnamed: 0,X10,X11,X12,X13,X14,X15,X16,X17,X18,X19,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,0,0,1,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


## 1.8 One-hot Encode Categorical Features

In [13]:
# Replace every categorical column with one indicator column per category value.
data_cat = pd.get_dummies(data=data_cat)
data_cat.head(n=5)

Unnamed: 0,X0_a,X0_aa,X0_ab,X0_ac,X0_ad,X0_ae,X0_af,X0_ag,X0_ai,X0_aj,...,X8_p,X8_q,X8_r,X8_s,X8_t,X8_u,X8_v,X8_w,X8_x,X8_y
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 1.9 Merge categorical and numerical features

In [14]:
# Glue ID, encoded categoricals and numeric columns back together side by side.
# All three pieces come from the same sorted frame, so their row labels match.
data = pd.concat(objs=[data_ID, data_cat, data_num], axis="columns")
data.head(n=5)

Unnamed: 0,ID,X0_a,X0_aa,X0_ab,X0_ac,X0_ad,X0_ae,X0_af,X0_ag,X0_ai,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


## 1.10 Split data back into train and test

In [15]:
# Recover the training rows in the SAME order as `train_ID` (the order of the
# original file).  `data` was sorted by ID earlier, whereas the target vector
# keeps the file order and is later paired with these rows by position; a
# plain boolean filter would silently mis-pair features and targets if
# train.csv were not already sorted by ID.  A left merge keeps the key order
# of its left operand, and "ID" stays the first column.
train = train_ID.to_frame().merge(data, on="ID", how="left")

# One output row per training ID; anything else means IDs are duplicated
# somewhere in train/test and the split-by-ID logic would be ambiguous.
assert len(train) == len(train_ID), "IDs are not unique across train and test"

print("train set shape : ", train.shape)
train.head()

train set shape :  (4209, 580)


Unnamed: 0,ID,X0_a,X0_aa,X0_ab,X0_ac,X0_ad,X0_ae,X0_af,X0_ag,X0_ai,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,6,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,13,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# (rows, columns) of the rebuilt training frame.
train.shape

(4209, 580)

In [17]:
# Keep the rows of the merged frame whose ID appears in the original test file.
in_test = data["ID"].isin(test_ID.tolist())
test = data.loc[in_test]

print("test set shape : ", test.shape)
test.head(n=5)

test set shape :  (4209, 580)


Unnamed: 0,ID,X0_a,X0_aa,X0_ab,X0_ac,X0_ad,X0_ae,X0_af,X0_ag,X0_ai,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


# 2. Modeling

## 2.1 Split Data into train and validation

In [19]:
# Hold out 30% of the training rows for validation, seeded for repeatability.
# NOTE(review): `train` still carries the ID column, so ID is passed to the
#   model as a feature — confirm this is intended.
# NOTE(review): `train_y` is rebound to the training part of the split, so
#   re-running this cell without re-running the target extraction pairs the
#   full `train` frame with an already-shortened `train_y`.
split_options = dict(test_size=0.3, random_state=random_state)
train_X, valid_X, train_y, valid_y = train_test_split(train, train_y, **split_options)

## 2.3 Fit Model

In [20]:
# Fit a linear regression on the training split.  `fit` returns the estimator
# itself, so construction and fitting are chained; the bare name on the last
# line displays the fitted estimator.
lr = LinearRegression().fit(train_X, train_y)
lr

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

## 2.4 Get predictions

In [21]:
# Predictions of the fitted model on the training and validation splits.
train_y_pred, valid_y_pred = (
    lr.predict(features) for features in (train_X, valid_X)
)

# 3. Evaluation

## 3.1 RMSE

In [23]:
# RMSE is the square root of the mean squared error; report it for both splits.
train_mse = mean_squared_error(y_true=train_y, y_pred=train_y_pred)
valid_mse = mean_squared_error(y_true=valid_y, y_pred=valid_y_pred)

train_rmse, valid_rmse = np.sqrt(train_mse), np.sqrt(valid_mse)

print("train set's rmse : ", train_rmse)
print("valid set's rmse : ", valid_rmse)

train set's rmse :  7.369491371496822
valid set's rmse :  9.502842857043387


## 3.2 $R^2$

In [24]:
# Coefficient of determination (R^2) of the predictions on each split.
train_r2 = r2_score(y_true=train_y, y_pred=train_y_pred)
valid_r2 = r2_score(y_true=valid_y, y_pred=valid_y_pred)

print("train set's r^2 score : ", train_r2)
print("valid set's r^2 score : ", valid_r2)

train set's r^2 score :  0.6509778190695636
valid set's r^2 score :  0.4769454954924194


# 4. Make submission

In [25]:
# Predict the target for every test row; `test` is cut from the same merged
# frame as the training data, so it has the same columns in the same order.
test_y_pred = lr.predict(X=test)

In [27]:
# Display the predicted target values for the test rows.
test_y_pred

array([ 65.53805554, 108.16354624,  75.42018922, ...,  94.46649458,
       107.86834622,  92.24318504])

In [30]:
# Build the submission table.  The IDs are read from `test` itself — the very
# frame the predictions were computed from — so each prediction is guaranteed
# to sit next to its own ID.  (Taking them from `test_ID`, which keeps the
# original file order, is only correct when test.csv is already sorted by ID,
# because `test` was rebuilt from the ID-sorted merged frame.)
# `.values` drops the row labels so the result gets a fresh 0..n-1 index.
# NOTE(review): the input files name this column 'ID'; confirm the submission
#   format accepts the lowercase 'id' header used here.
output = pd.DataFrame({'id': test['ID'].values, 'y': test_y_pred})
output.head()

Unnamed: 0,id,y
0,1,65.538056
1,2,108.163546
2,3,75.420189
3,4,72.588103
4,5,103.259246


In [31]:
# Write the submission file; index=False keeps the row index out of the CSV.
output.to_csv(path_or_buf="v002_Benz_submission.csv", index=False)