In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# 1. EDA

## 1.1 Load Data

In [2]:
train = pd.read_csv("../../input/train.csv")
test = pd.read_csv('../../input/test.csv')

## 1.2 Data Shape

### 1.2.1 train data

In [4]:
print("train set shape : ", train.shape)
train.head()

train set shape :  (4209, 378)


Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


### 1.2.2 test data

In [5]:
print("test set shape : ", test.shape)
test.head()

test set shape :  (4209, 377)


Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0


## 1.3 Export Target Feature

In [6]:
train_y = train['y']
train.drop(['y'], axis=1, inplace=True)

## 1.4 Export ID Feature

In [7]:
train_ID = train['ID']
train.drop(['ID'], axis=1, inplace=True)

test_ID = test['ID']
test.drop(['ID'], axis=1, inplace=True)

## 1.5 Split Data to Categorical and Numerical

### 1.5.1 train data

In [8]:
train.dtypes.value_counts()

int64     368
object      8
dtype: int64

In [9]:
train_cat_features = train.select_dtypes(include=['object']).columns

train_cat = train[train_cat_features]
train_cat.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8
0,k,v,at,a,d,u,j,o
1,k,t,av,e,d,y,l,o
2,az,w,n,c,d,x,j,x
3,az,t,n,f,d,x,l,e
4,az,v,n,f,d,h,d,n


In [10]:
train_num_features = train.select_dtypes(include=['int64']).columns

train_num = train[train_num_features]
train_num.head()

Unnamed: 0,X10,X11,X12,X13,X14,X15,X16,X17,X18,X19,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,0,0,1,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 1.5.2 test data

In [11]:
test.dtypes.value_counts()

int64     368
object      8
dtype: int64

In [12]:
test_cat_features = test.select_dtypes(include=['object']).columns

test_cat = test[test_cat_features]
test_cat.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8
0,az,v,n,f,d,t,a,w
1,t,b,ai,a,d,b,g,y
2,az,v,as,f,d,a,j,j
3,az,l,n,f,d,z,l,n
4,w,s,as,c,d,y,i,m


In [13]:
test_num_features = train.select_dtypes(include=['int64']).columns

test_num = test[test_num_features]
test_num.head()

Unnamed: 0,X10,X11,X12,X13,X14,X15,X16,X17,X18,X19,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


## 1.6 One-hot Encoding to Categorical Features

### 1.6.1 train data

In [14]:
train_cat = pd.get_dummies(train_cat)
train_cat.head()

Unnamed: 0,X0_a,X0_aa,X0_ab,X0_ac,X0_ad,X0_af,X0_ai,X0_aj,X0_ak,X0_al,...,X8_p,X8_q,X8_r,X8_s,X8_t,X8_u,X8_v,X8_w,X8_x,X8_y
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 1.6.2 test data

In [15]:
test_cat = pd.get_dummies(test_cat)
test_cat.head()

Unnamed: 0,X0_a,X0_ad,X0_ae,X0_af,X0_ag,X0_ai,X0_aj,X0_ak,X0_al,X0_am,...,X8_p,X8_q,X8_r,X8_s,X8_t,X8_u,X8_v,X8_w,X8_x,X8_y
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 1.7 Merge Data

### 1.7.1 train data

In [16]:
train = pd.concat([train_cat, train_num], axis=1)
train.head()

Unnamed: 0,X0_a,X0_aa,X0_ab,X0_ac,X0_ad,X0_af,X0_ai,X0_aj,X0_ak,X0_al,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 1.7.2 test data

In [17]:
test = pd.concat([test_cat, test_num], axis=1)
test.head()

Unnamed: 0,X0_a,X0_ad,X0_ae,X0_af,X0_ag,X0_ai,X0_aj,X0_ak,X0_al,X0_am,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [31]:
test.shape

(4209, 569)

# 2. Modeling

## 2.1 Split Data to Train and Validation

In [19]:
train_X, valid_X, train_y, valid_y = train_test_split(train, train_y, test_size=0.3, random_state=1234)

In [30]:
train_X.shape

(2946, 563)

In [32]:
valid_X.shape

(1263, 563)

## 2.2 Fit to Model

In [20]:
lr = LinearRegression()

In [21]:
lr.fit(train_X, train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

## 2.3 Get Predict

In [22]:
train_y_pred = lr.predict(train_X)
valid_y_pred = lr.predict(valid_X)

# 3. Evaluation

## 3.1 RMSE

In [24]:
train_mse = mean_squared_error(train_y, train_y_pred)
valid_mse = mean_squared_error(valid_y, valid_y_pred)

train_rmse = np.sqrt(train_mse)
valid_rmse = np.sqrt(valid_mse)

print("train set's rmse : ", train_rmse)
print("valid set's rmse : ", valid_rmse)

train set's rmse :  7.3799074591465486
valid set's rmse :  1643685190919.2847


## 3.2 $R^2$

In [26]:
train_r2 = r2_score(train_y, train_y_pred)
# train_r2 = lr.score(train_y, train_y_pred)
valid_r2 = r2_score(valid_y, valid_y_pred)
# valid_r2 = lr.score(valid_y, valid_y_pred)

print("train set's r^2 score : ", train_r2)
print("valid set's r^2 score : ", valid_r2)

train set's r^2 score :  0.6499905013223062
valid set's r^2 score :  -1.5648659322638843e+22


# 4. Make Submission

In [27]:
test_y_pred = lr.predict(test)

ValueError: shapes (4209,569) and (563,) not aligned: 569 (dim 1) != 563 (dim 0)