In [1]:
import numpy as np
import pandas as pd 
import math
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
stores = pd.read_csv('stores.csv')
features = pd.read_csv('features.csv')

In [3]:
df = train.merge(stores, how='left').merge(features, how='left')
test_merge = test.merge(stores, how='left').merge(features, how='left')

In [4]:
def split_date(df0):
    df0['Date'] = pd.to_datetime(df0['Date'])
    df0['Year'] = df0['Date'].dt.year
    df0['Month'] = df0['Date'].dt.month
    df0['Day'] = df0['Date'].dt.day
    df0['Week_of_year'] = df0['Date'].dt.isocalendar().week*1.0
    return df0.head(5)

In [5]:
split_date(df)
split_date(test_merge)

Unnamed: 0,Store,Dept,Date,IsHoliday,Type,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Year,Month,Day,Week_of_year
0,1,1,2012-11-02,False,A,151315,55.32,3.386,6766.44,5147.7,50.82,3639.9,2737.42,223.462779,6.573,2012,11,2,44.0
1,1,1,2012-11-09,False,A,151315,61.24,3.314,11421.32,3370.89,40.28,4646.79,6154.16,223.481307,6.573,2012,11,9,45.0
2,1,1,2012-11-16,False,A,151315,52.92,3.252,9696.28,292.1,103.78,1133.15,6612.69,223.512911,6.573,2012,11,16,46.0
3,1,1,2012-11-23,True,A,151315,56.23,3.211,883.59,4.17,74910.32,209.91,303.32,223.561947,6.573,2012,11,23,47.0
4,1,1,2012-11-30,False,A,151315,52.34,3.207,2460.03,,3838.35,150.57,6966.34,223.610984,6.573,2012,11,30,48.0


In [6]:
df['Type_trans'] = df['Type'].map({'A':1, 'B':2, 'C':3})
test_merge['Type_trans'] = test_merge['Type'].map({'A':1, 'B':2, 'C':3})

In [7]:
df

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Type,Size,Temperature,Fuel_Price,MarkDown1,...,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Year,Month,Day,Week_of_year,Type_trans
0,1,1,2010-02-05,24924.50,False,A,151315,42.31,2.572,,...,,,,211.096358,8.106,2010,2,5,5.0,1
1,1,1,2010-02-12,46039.49,True,A,151315,38.51,2.548,,...,,,,211.242170,8.106,2010,2,12,6.0,1
2,1,1,2010-02-19,41595.55,False,A,151315,39.93,2.514,,...,,,,211.289143,8.106,2010,2,19,7.0,1
3,1,1,2010-02-26,19403.54,False,A,151315,46.63,2.561,,...,,,,211.319643,8.106,2010,2,26,8.0,1
4,1,1,2010-03-05,21827.90,False,A,151315,46.50,2.625,,...,,,,211.350143,8.106,2010,3,5,9.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421565,45,98,2012-09-28,508.37,False,B,118221,64.88,3.997,4556.61,...,1.50,1601.01,3288.25,192.013558,8.684,2012,9,28,39.0,2
421566,45,98,2012-10-05,628.10,False,B,118221,64.89,3.985,5046.74,...,18.82,2253.43,2340.01,192.170412,8.667,2012,10,5,40.0,2
421567,45,98,2012-10-12,1061.02,False,B,118221,54.47,4.000,1956.28,...,7.89,599.32,3990.54,192.327265,8.667,2012,10,12,41.0,2
421568,45,98,2012-10-19,760.01,False,B,118221,56.47,3.969,2004.02,...,3.18,437.73,1537.49,192.330854,8.667,2012,10,19,42.0,2


In [8]:
df_model = df.drop(columns=['Temperature', 'Type', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
                'CPI', 'Unemployment', 'Date', 'Month', 'Day'])

In [9]:
# label encoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df_model['IsHoliday'] = le.fit_transform(df_model['IsHoliday'])

In [10]:
df_model

Unnamed: 0,Store,Dept,Weekly_Sales,IsHoliday,Size,Year,Week_of_year,Type_trans
0,1,1,24924.50,0,151315,2010,5.0,1
1,1,1,46039.49,1,151315,2010,6.0,1
2,1,1,41595.55,0,151315,2010,7.0,1
3,1,1,19403.54,0,151315,2010,8.0,1
4,1,1,21827.90,0,151315,2010,9.0,1
...,...,...,...,...,...,...,...,...
421565,45,98,508.37,0,118221,2012,39.0,2
421566,45,98,628.10,0,118221,2012,40.0,2
421567,45,98,1061.02,0,118221,2012,41.0,2
421568,45,98,760.01,0,118221,2012,42.0,2


In [11]:
X = df_model.drop(columns=['Weekly_Sales'])
y = df_model['Weekly_Sales']

In [12]:
# scaler = MinMaxScaler().fit(X)
# X_scaler = pd.DataFrame(scaler.transform(X), columns=X.columns)
X_scaler = X

In [13]:
X_train, X_val, y_train, y_val = train_test_split(X_scaler, y, test_size=0.2, random_state=123)

In [14]:
def WMAE(data, prediction, actual):
    weights = data['IsHoliday'].apply(lambda x: 5 if x==1 else 1)
    return np.round(np.sum(weights*abs(prediction-actual))/weights.sum(), 2)

In [15]:
year = X_train['Year']
X_train.drop('Year', axis=1, inplace=True)
X_train['Year'] = year
X_train

Unnamed: 0,Store,Dept,IsHoliday,Size,Week_of_year,Type_trans,Year
277233,29,1,0,93638,41.0,2,2012
54699,6,46,0,202505,15.0,1,2010
377655,40,56,0,155083,22.0,1,2012
63396,7,33,0,70713,3.0,2,2012
5876,1,45,0,151315,20.0,1,2011
...,...,...,...,...,...,...,...
192476,20,38,0,203742,23.0,1,2011
17730,2,71,0,202307,24.0,1,2010
28030,3,81,0,37392,35.0,2,2010
277869,29,6,0,93638,14.0,2,2011


In [16]:
model4 = RandomForestRegressor(random_state=123, n_jobs=-1).fit(X_train, y_train)
#train error
forest_train_wmae = WMAE(X_train, model4.predict(X_train), y_train)

#validation error
y_pred = model4.predict(X_val)
forest_val_wmae = WMAE(X_val, y_pred, y_val)

#results:
print('Training dataset WMAE is', forest_train_wmae)
print('Validation dataset WMAE is', forest_val_wmae)


Feature names must be in the same order as they were in fit.



Training dataset WMAE is 557.22
Validation dataset WMAE is 6398.46


In [17]:
# save model .under .sav format
import pickle
filename = 'RandomForest.sav'
pickle.dump(model4, open(filename, 'wb'))

In [9]:
set2={"hello", "hi", "xin chao"}

In [10]:
print(set2)



{'hello', 'xin chao', 'hi'}


In [11]:
print(set2)

{'hello', 'xin chao', 'hi'}


In [8]:
print(set2)

{8, 5, 6, 7}


In [5]:
myset = {5,7,6,4,4,4,4,4,4}
for item in myset:
   print(item)

4
5
6
7


In [1]:
myset = {5,7,6,4}
for item in myset:
   print(item)

4
5
6
7


In [2]:
myset = {5,7,6,4}
for item in myset:
   print(item)

4
5
6
7


In [None]:
import pickle
#load pickle file
filename = 'RandomForest.pkl'
load = pickle.load(open(filename, 'rb'))
load.
