In [51]:

import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import KFold

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

In [52]:
from sklearn.neighbors import KNeighborsRegressor

class Model1KNN:

    def __init__(self):
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        self.scaler = StandardScaler()
        self.scaler.fit(tr_x)
        tr_x = self.scaler.transform(tr_x)
        #params = {"kernel":['rbf'],"C":np.logspace(0,1,params_cnt), "epsilon":np.logspace(-1,1,params_cnt)}
        self.model = KNeighborsRegressor(n_neighbors=5,
                                         #weights='uniform'
                                        )
        
        self.model.fit(tr_x,tr_y)
        
    def predict(self,x):
        x = self.scaler.transform(x)
        pred = self.model.predict(x)
        return pred
    

from keras.models import Sequential
from keras.layers import Dense, Dropout

from keras.callbacks import EarlyStopping

class Model1NN:

    def __init__(self):
        self.model = None
        self.scaler = None
    '''
    def weight_variable(self,shape,name):
        initial =tf.truncated_normal(shape,stddev=0.1)
        return tf.Variable(initial, name=name)

    def bias_variable(self,shape,name):
        initial = tf.constant(0.1,shape=shape)
        return tf.Variable(initial, name=name)
    '''     
    def fit(self, tr_x, tr_y, va_x, va_y):
        self.scaler = StandardScaler()
        self.scaler.fit(tr_x)
        
        batch_size = 128
        epochs = 10000
        
        tr_x = self.scaler.transform(tr_x)
        va_x = self.scaler.transform(va_x)
        
        early_stopping =  EarlyStopping(
                            monitor='val_loss',
                            min_delta=0.0,
                            patience=20,
        )

        model = Sequential()
        model.add(Dense(32, activation='relu', input_shape=(tr_x.shape[1],)))
        model.add(Dropout(0.5))
        model.add(Dense(32, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(1, activation='sigmoid'))

        model.compile(loss='mean_squared_error', #'categorical_crossentropy',#categorical_crossentropy
                      optimizer='adam')

        history = model.fit(tr_x, tr_y,
                            batch_size=batch_size, epochs=epochs,
                            verbose=1,
                            validation_data=(va_x, va_y),
                            callbacks=[early_stopping])
        self.model = model

    def predict(self, x):
        x = self.scaler.transform(x)
        pred = self.model.predict(x).argmax(axis=1)
        return pred
    

import lightgbm as lgb

class Model1lgb:

    def __init__(self):
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        lgb_params = {'objective': 'rmse',
                  'random_state': 10,
                  'metric': 'rmse'}
        lgb_train = lgb.Dataset(tr_x, label=tr_y)
        lgb_eval = lgb.Dataset(va_x, label=va_y,reference=lgb_train)
        self.model = lgb.train(lgb_params, lgb_train, valid_sets=lgb_eval, num_boost_round=10000,early_stopping_rounds=50)

    def predict(self, x):
        pred = self.model.predict(x,num_iteration=self.model.best_iteration)
        return pred
    
from sklearn.ensemble import RandomForestRegressor

class Model1RF:

    def __init__(self):
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        self.scaler = StandardScaler()
        self.scaler.fit(tr_x)
        tr_x = self.scaler.transform(tr_x)
        self.model = RandomForestRegressor(
            max_depth=5,
            n_estimators=100,
            random_state=10,
        )
        self.model.fit(tr_x,tr_y)
        
    def predict(self,x):
        x = self.scaler.transform(x)
        pred = self.model.predict(x)
        return pred
    

from sklearn.linear_model import LinearRegression

class Model2Linear:

    def __init__(self):
        self.model = None
        self.scaler = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        self.scaler = StandardScaler()
        self.scaler.fit(tr_x)
        tr_x = self.scaler.transform(tr_x)
        self.model = LinearRegression()
        self.model.fit(tr_x, tr_y)

    def predict(self, x):
        x = self.scaler.transform(x)
        pred = self.model.predict(x)
        return pred

In [53]:
def prepare_data(train, test):
    df_trainval = pd.read_csv(train)
    df_test = pd.read_csv(test)

    y_trainval = df_trainval['revenue']
    del df_trainval['revenue']

    df_all = pd.concat([df_trainval,df_test],axis=0)
    df_all['Open Date'] = pd.to_datetime(df_all["Open Date"])
    df_all['Year'] = df_all['Open Date'].apply(lambda x:x.year)
    df_all['Month'] = df_all['Open Date'].apply(lambda x:x.month)
    df_all['Day'] = df_all['Open Date'].apply(lambda x:x.day)
    df_all['week_name'] = df_all['Open Date'].apply(lambda x:x.day_name())

    le = LabelEncoder()
    df_all['City'] = le.fit_transform(df_all['City'])
    df_all['City Group'] = df_all['City Group'].map({'Other':0,'Big Cities':1}) #There are only 'Other' or 'Big city'
    df_all["Type"] = df_all["Type"].map({"FC":0, "IL":1, "DT":2, "MB":3}) #There are only 'FC' or 'IL' or 'DT' or 'MB'
    df_all["week_name"] = df_all["week_name"].map({"Sunday":0, "Monday":1, "Tuesday":2, "Wednesday":3,"Thursday":4,"Friday":5,"Saturday":6}) #There are only 'FC' or 'IL' or 'DT' or 'MB'

    df_trainval = df_all.iloc[:df_trainval.shape[0]]

    df_test = df_all.iloc[df_trainval.shape[0]:]
    df_train_col = [col for col in df_trainval.columns if col not in ['Id','Open Date']]
    df_trainval = df_trainval[df_train_col]
    df_test = df_test[df_train_col]
    print(df_test)
    print(df_trainval)
    return df_trainval, y_trainval, df_test

In [54]:
def prepare_test_data(test):
    df_test = pd.read_csv(test)

    df_all = pd.concat([df_test],axis=0)
    df_all['Open Date'] = pd.to_datetime(df_all["Open Date"])
    df_all['Year'] = df_all['Open Date'].apply(lambda x:x.year)
    df_all['Month'] = df_all['Open Date'].apply(lambda x:x.month)
    df_all['Day'] = df_all['Open Date'].apply(lambda x:x.day)
    df_all['week_name'] = df_all['Open Date'].apply(lambda x:x.day_name())

    le = LabelEncoder()
    df_all['City'] = le.fit_transform(df_all['City'])
    df_all['City Group'] = df_all['City Group'].map({'Other':0,'Big Cities':1}) #There are only 'Other' or 'Big city'
    df_all["Type"] = df_all["Type"].map({"FC":0, "IL":1, "DT":2, "MB":3}) #There are only 'FC' or 'IL' or 'DT' or 'MB'
    df_all["week_name"] = df_all["week_name"].map({"Sunday":0, "Monday":1, "Tuesday":2, "Wednesday":3,"Thursday":4,"Friday":5,"Saturday":6}) #There are only 'FC' or 'IL' or 'DT' or 'MB'
    df_test = df_all
    df_train_col = [col for col in df_test.columns if col not in ['Id','Open Date']]
    df_test = df_test[df_train_col]
    print(df_test)
    return df_test

In [55]:
def prepare_train_data(train):
    df_trainval = pd.read_csv(train)

    y_trainval = df_trainval['revenue']
    del df_trainval['revenue']

    df_all = pd.concat([df_trainval],axis=0)
    df_all['Open Date'] = pd.to_datetime(df_all["Open Date"])
    df_all['Year'] = df_all['Open Date'].apply(lambda x:x.year)
    df_all['Month'] = df_all['Open Date'].apply(lambda x:x.month)
    df_all['Day'] = df_all['Open Date'].apply(lambda x:x.day)
    df_all['week_name'] = df_all['Open Date'].apply(lambda x:x.day_name())

    le = LabelEncoder()
    df_all['City'] = le.fit_transform(df_all['City'])
    df_all['City Group'] = df_all['City Group'].map({'Other':0,'Big Cities':1}) #There are only 'Other' or 'Big city'
    df_all["Type"] = df_all["Type"].map({"FC":0, "IL":1, "DT":2, "MB":3}) #There are only 'FC' or 'IL' or 'DT' or 'MB'
    df_all["week_name"] = df_all["week_name"].map({"Sunday":0, "Monday":1, "Tuesday":2, "Wednesday":3,"Thursday":4,"Friday":5,"Saturday":6}) #There are only 'FC' or 'IL' or 'DT' or 'MB'

    df_trainval = df_all.iloc[:df_trainval.shape[0]]
    df_train_col = [col for col in df_trainval.columns if col not in ['Id','Open Date']]
    df_trainval = df_trainval[df_train_col]
    print(df_trainval)
    return df_trainval, y_trainval

In [56]:
def train_model_and_predict(model, train_x, train_y):
    preds = []
    va_idxes = []

    kf = KFold(n_splits=4, shuffle=True, random_state=10)

    for i, (tr_idx, va_idx) in enumerate(kf.split(train_x)):
        tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
        tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
        model.fit(tr_x, tr_y, va_x, va_y)
        pred = model.predict(va_x)
        preds.append(pred)
        va_idxes.append(va_idx)

    va_idxes = np.concatenate(va_idxes)
    preds = np.concatenate(preds, axis=0)
    order = np.argsort(va_idxes)
    pred_train = preds[order]

    return pred_train

In [57]:
def predict(model, test_x):
    return model.predict(test_x)

In [64]:
def fitting_models(models:dict, df_trainval, y_trainval):
    predict_train = []
    lin = models.pop('Linear')
    for model in models.values():
        predict_train.append(train_model_and_predict(model, df_trainval, y_trainval))
    pred_dict = {}
    for i, sublists in enumerate(predict_train, start=1):
        pred_dict[i] = sublists

    train_x_2 = pd.DataFrame(pred_dict)
    train_model_and_predict(lin, train_x_2, y_trainval)
    return lin 

In [66]:
def predict_revenue(final_regressor, models:dict, df_test):
    predict_test = []
    
    for model in models.values():
        predict_test.append(predict(model, df_test))
    pred_dict = {}
    for i, sublists in enumerate(predict_test, start=1):
        pred_dict[i] = sublists

    test_x_2 = pd.DataFrame(pred_dict)
    return predict(final_regressor, test_x_2)

def save_to_cvs(prediction):
    submission = pd.DataFrame({'Prediction':prediction})
    submission.to_csv('./submission4.csv',index=True) 


In [60]:
trainval_filename = '../data/train.csv.zip'
test_filename = '../data/test.csv.zip'
df_trainval, y_trainval = prepare_train_data(trainval_filename)

     City  City Group  Type  P1   P2   P3   P4  P5  P6  P7  ...  P32  P33   
0      31           1     1   4  5.0  4.0  4.0   2   2   5  ...    4    5  \
1       3           1     0   4  5.0  4.0  4.0   1   2   5  ...    0    0   
2      10           0     1   2  4.0  2.0  5.0   2   3   5  ...    0    0   
3      28           0     1   6  4.5  6.0  6.0   4   4  10  ...   10    6   
4      14           0     1   3  4.0  3.0  4.0   2   2   5  ...    3    2   
..    ...         ...   ...  ..  ...  ...  ...  ..  ..  ..  ...  ...  ...   
132    29           0     0   2  3.0  3.0  5.0   4   2   4  ...    0    0   
133    32           1     0   4  5.0  4.0  4.0   2   3   5  ...    0    0   
134    18           0     0   3  4.0  4.0  4.0   2   3   5  ...    0    0   
135    31           1     0   4  5.0  4.0  5.0   2   2   5  ...    0    0   
136    31           1     0   4  5.0  3.0  5.0   2   2   5  ...    0    0   

     P34  P35  P36  P37  Year  Month  Day  week_name  
0      5    4    3  

In [61]:
df_test = prepare_test_data(test_filename)

       City  City Group  Type  P1   P2   P3   P4  P5  P6  P7  ...  P32  P33   
0        38           0     0   1  4.0  4.0  4.0   1   2   5  ...    0    0  \
1        27           0     1   3  4.0  4.0  4.0   2   2   5  ...    0    0   
2         3           1     0   3  4.0  4.0  4.0   2   2   5  ...    0    0   
3        26           0     1   2  4.0  4.0  4.0   2   3   5  ...    0    0   
4         1           0     0   2  4.0  4.0  4.0   1   2   5  ...    0    0   
...     ...         ...   ...  ..  ...  ...  ...  ..  ..  ..  ...  ...  ...   
99995     4           0     0   5  5.0  4.0  4.0   2   2   5  ...    0    0   
99996    38           0     1   1  2.0  4.0  3.0   1   1   1  ...    0    0   
99997    54           1     1   4  5.0  4.0  4.0   1   2   5  ...    3    2   
99998    54           1     0  12  7.5  6.0  6.0   4   4  10  ...    0    4   
99999    54           1     1   2  5.0  4.0  4.0   2   2   5  ...    0    2   

       P34  P35  P36  P37  Year  Month  Day  week_n

In [70]:
model_1a = Model1KNN()
model_1b = Model1NN()
model_1c = Model1RF()
model_1d = Model1lgb()
model2 = Model2Linear()
models = {
    'KNeighbors':model_1a,
    'NeuralNetwork':model_1b,
    'RandomForest':model_1c, 
    'LightGBM': model_1d,
    'Linear':model2,
}

lin = fitting_models(models, df_trainval, y_trainval)
prediction = predict_revenue(lin ,models, df_test)

save_to_cvs(prediction)

Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 332
[LightGBM] [Info] Number of data points in the train set: 102, number of used features: 44
[LightGBM] [Info] Start training from score 4366984.794118
[1]	valid_0's rmse: 2.79678e+06
Training until validation scores don't improve for 50 rounds
[2]	valid_0's rmse: 2.76872e+06
[3]	valid_0's rmse: 2.73529e+06
[4]	valid_0's rmse: 2.70805e+06
[5]	valid_0's rmse: 2.69223e+06
[6]	valid_0's rmse: 2.66282e+06
[7]	valid_0's rmse: 2.64506e+06
[8]	valid_0's rmse: 2.63429e+06
[9]	valid_0's rmse: 2.62025e+06
[10]	valid_0's rmse: 2.60388e+06
[11]	valid_0's rmse: 2.59307e+06
[12]	valid_0's rmse: 2.58971e+06
[13]	valid_0's rmse: 2.58136e+06
[14]	valid_0's rmse: 2.56908e+06
[15]	valid_0's rmse: 2.55642e+06
[16]	valid_0's rmse: 2.55879e+06
[17]	valid_0's rmse: 2.56362e+06
[18]	valid_0's rmse: 2.5527e+06
[19]	valid_0's rmse: 2.55099e+06
[20]	valid_0's rm

2023-05-25 15:11:30.438636: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 17600000 exceeds 10% of free system memory.


