In [4]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import catboost
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import KFold

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV

In [5]:
import os
for dirname, _, filenames in os.walk('../data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

../data/test.csv.zip
../data/sampleSubmission.csv
../data/train.csv.zip
../data/web_sub.csv


In [6]:
trainval_filename = '../data/train.csv.zip'
test_filename = '../data/test.csv.zip'
df_trainval = pd.read_csv(trainval_filename)
df_test = pd.read_csv(test_filename)
#X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=1,test_size=0.1)
print(df_trainval.head(5))

   Id   Open Date        City  City Group Type  P1   P2   P3   P4  P5  ...   
0   0  07/17/1999    İstanbul  Big Cities   IL   4  5.0  4.0  4.0   2  ...  \
1   1  02/14/2008      Ankara  Big Cities   FC   4  5.0  4.0  4.0   1  ...   
2   2  03/09/2013  Diyarbakır       Other   IL   2  4.0  2.0  5.0   2  ...   
3   3  02/02/2012       Tokat       Other   IL   6  4.5  6.0  6.0   4  ...   
4   4  05/09/2009   Gaziantep       Other   IL   3  4.0  3.0  4.0   2  ...   

   P29  P30  P31  P32  P33  P34  P35  P36  P37    revenue  
0  3.0    5    3    4    5    5    4    3    4  5653753.0  
1  3.0    0    0    0    0    0    0    0    0  6923131.0  
2  3.0    0    0    0    0    0    0    0    0  2055379.0  
3  7.5   25   12   10    6   18   12   12    6  2675511.0  
4  3.0    5    1    3    2    3    4    3    3  4316715.0  

[5 rows x 43 columns]


In [7]:
y_trainval = df_trainval['revenue']
del df_trainval['revenue']

In [8]:
print(df_trainval[['City Group','Type']].head())

   City Group Type
0  Big Cities   IL
1  Big Cities   FC
2       Other   IL
3       Other   IL
4       Other   IL


In [9]:
df_all = pd.concat([df_trainval,df_test],axis=0)
df_all['Open Date'] = pd.to_datetime(df_all["Open Date"])
df_all['Year'] = df_all['Open Date'].apply(lambda x:x.year)
df_all['Month'] = df_all['Open Date'].apply(lambda x:x.month)
df_all['Day'] = df_all['Open Date'].apply(lambda x:x.day)
df_all['week_name'] = df_all['Open Date'].apply(lambda x:x.day_name())

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df_all['City'] = le.fit_transform(df_all['City'])
df_all['City Group'] = df_all['City Group'].map({'Other':0,'Big Cities':1}) #There are only 'Other' or 'Big city'
df_all["Type"] = df_all["Type"].map({"FC":0, "IL":1, "DT":2, "MB":3}) #There are only 'FC' or 'IL' or 'DT' or 'MB'
print(df_all.head())
df_all["week_name"] = df_all["week_name"].map({"Sunday":0, "Monday":1, "Tuesday":2, "Wednesday":3,"Thursday":4,"Friday":5,"Saturday":6}) #There are only 'FC' or 'IL' or 'DT' or 'MB'
print(df_all.head())

   Id  Open Date  City  City Group  Type  P1   P2   P3   P4  P5  ...  P32   
0   0 1999-07-17    60           1     1   4  5.0  4.0  4.0   2  ...    4  \
1   1 2008-02-14     4           1     0   4  5.0  4.0  4.0   1  ...    0   
2   2 2013-03-09    14           0     1   2  4.0  2.0  5.0   2  ...    0   
3   3 2012-02-02    52           0     1   6  4.5  6.0  6.0   4  ...   10   
4   4 2009-05-09    21           0     1   3  4.0  3.0  4.0   2  ...    3   

   P33  P34  P35  P36  P37  Year  Month  Day  week_name  
0    5    5    4    3    4  1999      7   17   Saturday  
1    0    0    0    0    0  2008      2   14   Thursday  
2    0    0    0    0    0  2013      3    9   Saturday  
3    6   18   12   12    6  2012      2    2   Thursday  
4    2    3    4    3    3  2009      5    9   Saturday  

[5 rows x 46 columns]
   Id  Open Date  City  City Group  Type  P1   P2   P3   P4  P5  ...  P32   
0   0 1999-07-17    60           1     1   4  5.0  4.0  4.0   2  ...    4  \
1   1 2008-0

In [10]:
df_trainval = df_all.iloc[:df_trainval.shape[0]]
df_test = df_all.iloc[df_trainval.shape[0]:]

In [11]:
df_train_col = [col for col in df_trainval.columns if col not in ['Id','Open Date']]
df_trainval = df_trainval[df_train_col]
df_test = df_test[df_train_col]

In [12]:
from sklearn.neighbors import KNeighborsRegressor

class Model1KNN:

    def __init__(self):
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        self.scaler = StandardScaler()
        self.scaler.fit(tr_x)
        tr_x = self.scaler.transform(tr_x)
        #params = {"kernel":['rbf'],"C":np.logspace(0,1,params_cnt), "epsilon":np.logspace(-1,1,params_cnt)}
        self.model = KNeighborsRegressor(n_neighbors=5,
                                         #weights='uniform'
                                        )
        
        self.model.fit(tr_x,tr_y)
        
    def predict(self,x):
        x = self.scaler.transform(x)
        pred = self.model.predict(x)
        return pred

In [13]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

from keras.callbacks import EarlyStopping

class Model1NN:

    def __init__(self):
        self.model = None
        self.scaler = None
    '''
    def weight_variable(self,shape,name):
        initial =tf.truncated_normal(shape,stddev=0.1)
        return tf.Variable(initial, name=name)

    def bias_variable(self,shape,name):
        initial = tf.constant(0.1,shape=shape)
        return tf.Variable(initial, name=name)
    '''     
    def fit(self, tr_x, tr_y, va_x, va_y):
        self.scaler = StandardScaler()
        self.scaler.fit(tr_x)
        
        batch_size = 128
        epochs = 10000
        
        tr_x = self.scaler.transform(tr_x)
        va_x = self.scaler.transform(va_x)
        
        early_stopping =  EarlyStopping(
                            monitor='val_loss',
                            min_delta=0.0,
                            patience=20,
        )

        model = Sequential()
        model.add(Dense(32, activation='sigmoid', input_shape=(tr_x.shape[1],)))
        model.add(Dropout(0.2))
        model.add(Dense(32, activation='sigmoid'))
        model.add(Dropout(0.2))
        model.add(Dense(1, activation='sigmoid'))

        model.compile(loss='mean_squared_error', #'categorical_crossentropy',#categorical_crossentropy
                      optimizer='adam')

        history = model.fit(tr_x, tr_y,
                            batch_size=batch_size, epochs=epochs,
                            verbose=1,
                            validation_data=(va_x, va_y),
                            callbacks=[early_stopping])
        self.model = model

    def predict(self, x):
        x = self.scaler.transform(x)
        pred = self.model.predict(x).argmax(axis=1)
        return pred

In [14]:
import lightgbm as lgb

class Model1lgb:

    def __init__(self):
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        lgb_params = {'objective': 'rmse',
                  'random_state': 10,
                  'metric': 'rmse'}
        lgb_train = lgb.Dataset(tr_x, label=tr_y)
        lgb_eval = lgb.Dataset(va_x, label=va_y,reference=lgb_train)
        self.model = lgb.train(lgb_params, lgb_train, valid_sets=lgb_eval, num_boost_round=10000,early_stopping_rounds=50)

    def predict(self, x):
        pred = self.model.predict(x,num_iteration=self.model.best_iteration)
        return pred

In [15]:
from sklearn.ensemble import RandomForestRegressor

class Model1RF:

    def __init__(self):
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        self.scaler = StandardScaler()
        self.scaler.fit(tr_x)
        tr_x = self.scaler.transform(tr_x)
        self.model = RandomForestRegressor(
            max_depth=5,
            n_estimators=100,
            random_state=10,
        )
        self.model.fit(tr_x,tr_y)
        
    def predict(self,x):
        x = self.scaler.transform(x)
        pred = self.model.predict(x)
        return pred

In [16]:
from sklearn.linear_model import LinearRegression

class Model2Linear:

    def __init__(self):
        self.model = None
        self.scaler = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        self.scaler = StandardScaler()
        self.scaler.fit(tr_x)
        tr_x = self.scaler.transform(tr_x)
        self.model = LinearRegression()
        self.model.fit(tr_x, tr_y)

    def predict(self, x):
        x = self.scaler.transform(x)
        pred = self.model.predict(x)
        return pred

In [17]:
def predict_cv(model, train_x, train_y, test_x):
    preds = []
    preds_test = []
    va_idxes = []

    kf = KFold(n_splits=4, shuffle=True, random_state=10)

    for i, (tr_idx, va_idx) in enumerate(kf.split(train_x)):
        tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
        tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
        model.fit(tr_x, tr_y, va_x, va_y)
        pred = model.predict(va_x)
        preds.append(pred)
        pred_test = model.predict(test_x)
        preds_test.append(pred_test)
        va_idxes.append(va_idx)

    va_idxes = np.concatenate(va_idxes)
    preds = np.concatenate(preds, axis=0)
    order = np.argsort(va_idxes)
    pred_train = preds[order]

    preds_test = np.mean(preds_test, axis=0)

    return pred_train, preds_test

In [18]:
def train_model_and_predict(model, train_x, train_y):
    preds = []
    va_idxes = []

    kf = KFold(n_splits=4, shuffle=True, random_state=10)

    for i, (tr_idx, va_idx) in enumerate(kf.split(train_x)):
        tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
        tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
        model.fit(tr_x, tr_y, va_x, va_y)
        pred = model.predict(va_x)
        preds.append(pred)
        va_idxes.append(va_idx)

    va_idxes = np.concatenate(va_idxes)
    preds = np.concatenate(preds, axis=0)
    order = np.argsort(va_idxes)
    pred_train = preds[order]

    return pred_train

In [19]:
def predict(model, train_x, test_x):
    return model.predict(test_x)

In [20]:
model_1a = Model1KNN()
model_1b = Model1NN()
model_1c = Model1RF()
model_1d = Model1lgb()
model2 = Model2Linear()
models = {
    'KNeighbors':model_1a,
    'NeuralNetwork':model_1b,
    'RandomForest':model_1c, 
    'LightGBM': model_1d,
    'Linear':model2,
}

In [21]:

pred_train_1a =  train_model_and_predict(model_1a, df_trainval, y_trainval)
pred_test_1a =  predict(model_1a, df_trainval, df_test)
print(pred_test_1a)

[4780794.8 2022220.6 3853535.2 ... 5085440.4 4693501.6 6261865.6]


In [22]:
pred_train_1b = train_model_and_predict(model_1b, df_trainval, y_trainval)
pred_test_1b = predict(model_1b, df_trainval, df_test)

Epoch 1/10000


2023-05-27 23:36:47.935699: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-05-27 23:36:48.150827: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 

In [23]:
pred_train_1c =  train_model_and_predict(model_1c, df_trainval, y_trainval)
pred_test_1c = predict(model_1c, df_trainval, df_test)
print(pred_test_1c)

[3552793.9669335  3418135.25837476 4024787.2315632  ... 5047910.08258673
 3699354.76512655 5068153.08446716]


In [24]:
pred_train_1d =  train_model_and_predict(model_1d, df_trainval, y_trainval)
pred_test_1d = predict(model_1d, df_trainval, df_test)
print(pred_test_1d)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 332
[LightGBM] [Info] Number of data points in the train set: 102, number of used features: 44
[LightGBM] [Info] Start training from score 4366984.794118
[1]	valid_0's rmse: 2.79678e+06
Training until validation scores don't improve for 50 rounds
[2]	valid_0's rmse: 2.76872e+06
[3]	valid_0's rmse: 2.73529e+06
[4]	valid_0's rmse: 2.70805e+06
[5]	valid_0's rmse: 2.69223e+06
[6]	valid_0's rmse: 2.66282e+06
[7]	valid_0's rmse: 2.64506e+06
[8]	valid_0's rmse: 2.63429e+06
[9]	valid_0's rmse: 2.62025e+06
[10]	valid_0's rmse: 2.60388e+06
[11]	valid_0's rmse: 2.59307e+06
[12]	valid_0's rmse: 2.58971e+06
[13]	valid_0's rmse: 2.58136e+06
[14]	valid_0's rmse: 2.56908e+06
[15]	valid_0's rmse: 2.55642e+06
[16]	valid_0's rmse: 2.55879e+06
[17]	valid_0's rmse: 2.56362e+06
[18]	valid_0's rmse: 2.5527e+06
[19]	valid_0's rmse: 2.55099e+06
[20]	valid_0's rm



[3696177.99325001 3912389.51264069 3848480.40382389 ... 5068819.43735508
 4748719.75528779 5046260.13008347]


In [33]:
from sklearn.metrics import mean_absolute_error

print(f'KNN mean_absolute_error: {mean_absolute_error(y_trainval,pred_train_1a):.4f}')
print(f'MLP mean_absolute_error: {mean_absolute_error(y_trainval,pred_train_1b):.4f}')
print(f'RandomForest mean_absolute_error: {mean_absolute_error(y_trainval,pred_train_1c):.4f}')
print(f'LightGBM mean_absolute_error: {mean_absolute_error(y_trainval,pred_train_1d):.4f}')

stat = pd.DataFrame({
    'KNN':mean_absolute_error(y_trainval,pred_train_1a), 
    'RF':mean_absolute_error(y_trainval,pred_train_1c),
    'LGBM':mean_absolute_error(y_trainval,pred_train_1d)}, index=[0])
stat.head()

KNN mean_absolute_error: 1608334.7431
MLP mean_absolute_error: 4453532.6131
RandomForest mean_absolute_error: 1670680.2016
LightGBM mean_absolute_error: 1560284.4391


Unnamed: 0,KNN,RF,LGBM
0,1608335.0,1670680.0,1560284.0


In [26]:
train_x_2 = pd.DataFrame({'pred_1a': pred_train_1a,
                          'pred_1c': pred_train_1c,
                          'pred_1d': pred_train_1d,
                         })
test_x_2 = pd.DataFrame({'pred_1a': pred_test_1a,
                          'pred_1c': pred_test_1c,
                          'pred_1d': pred_test_1d,
                         })
print(train_x_2)

       pred_1a       pred_1c       pred_1d
0    4387956.4  5.088421e+06  4.750886e+06
1    3748290.6  4.020149e+06  3.735571e+06
2    2770073.6  2.207294e+06  2.775671e+06
3    3709546.8  4.493500e+06  4.438892e+06
4    4463845.2  3.684721e+06  2.670208e+06
..         ...           ...           ...
132  3441556.6  3.680517e+06  3.831758e+06
133  5667006.2  5.506136e+06  6.062351e+06
134  3604085.6  3.679488e+06  3.788841e+06
135  4674186.2  5.466908e+06  4.989295e+06
136  5447667.0  6.654010e+06  5.762310e+06

[137 rows x 3 columns]


In [27]:

pred_train_2 = train_model_and_predict(model2, train_x_2, y_trainval)
pred_test_2 = predict(model2, df_trainval, test_x_2)

In [28]:
print(f'mean_absolute_error: {mean_absolute_error(y_trainval, pred_train_2):.4f}')

mean_absolute_error: 1623166.3113


In [29]:
df_test = pd.read_csv(test_filename)

In [30]:
submission = pd.DataFrame({'Prediction':pred_test_2})
# submission = pd.DataFrame({'Id':df_test['Id'],'Prediction':pred_test_2})

In [31]:
submission.to_csv('./submission3.csv',index=True)