In [1]:
import glob as glob
import pandas as pd
import os
import numpy as np
from sklearn.linear_model import LinearRegression

In [2]:
PATH = "model_results"

In [32]:
def gather_file(suffix):
    return glob.glob(f"{PATH}/*{suffix}.csv")

In [33]:
gather_file("_train")

['model_results/gbm_y_train.csv',
 'model_results/y_train.csv',
 'model_results/nn_y_train.csv',
 'model_results/svr_y_train.csv']

In [34]:
def get_y(file):
    df = pd.read_csv(file)
    return df['y'].to_numpy()

In [37]:
def gather_data(suffix):
    data = {}
    files = gather_file(suffix)
    for file in files:
        data[os.path.basename(file)[:-4].replace(suffix, "")] = get_y(file)
    return pd.DataFrame(data).dropna()

In [38]:
train_data = gather_data("_train")

In [39]:
train_data.columns

Index(['gbm_y', 'y', 'nn_y', 'svr_y'], dtype='object')

In [40]:
test_data = gather_data("_test")

In [42]:
columns = [column for column in list(train_data.columns) if column != 'y']
X = train_data[columns]
y = train_data['y']

In [43]:
X

Unnamed: 0,gbm_y,nn_y,svr_y
0,70.603613,74.776430,71.100175
1,72.022080,82.005490,72.900135
2,65.104640,69.469090,66.100194
3,57.515879,53.482124,55.099913
4,67.545963,65.903595,66.899800
...,...,...,...
1204,65.439919,59.998520,66.900106
1205,48.102046,56.331356,48.100342
1206,80.264049,78.337650,81.899754
1207,75.927829,71.979320,76.099838


In [44]:
y

0       71.0
1       73.0
2       66.0
3       55.0
4       67.0
        ... 
1204    67.0
1205    48.0
1206    82.0
1207    76.0
1208    81.0
Name: y, Length: 1209, dtype: float64

In [45]:
regressor = LinearRegression()
regressor.fit(X, y)

LinearRegression()

In [48]:
regressor.predict(X)

array([71.05150568, 72.88061269, 65.7959667 , ..., 81.9792597 ,
       76.22795744, 81.04239804])

In [49]:
y

0       71.0
1       73.0
2       66.0
3       55.0
4       67.0
        ... 
1204    67.0
1205    48.0
1206    82.0
1207    76.0
1208    81.0
Name: y, Length: 1209, dtype: float64

In [46]:
y_pred = regressor.predict(test_data)

In [47]:
y_pred

array([69.24552372, 66.05421144, 70.87613072, 68.37075973, 70.61372876,
       75.62361894, 61.66265123, 60.58671958, 78.50936017, 82.78272515,
       61.84640231, 83.76916745, 72.36109721, 81.80669344, 58.57472969,
       84.39229331, 71.85961568, 74.79840791, 70.99370858, 67.94270189,
       68.77319267, 68.6727972 , 76.96783427, 77.12939234, 61.85363461,
       70.3114271 , 56.09859318, 70.68255888, 65.74783512, 77.37098197,
       64.90021846, 71.40375065, 59.01491284, 73.46254936, 66.65803294,
       63.52398173, 76.82236147, 62.17685541, 70.94851566, 71.34782354,
       71.75519292, 77.14603628, 71.12683558, 78.3571097 , 79.53568098,
       68.77074695, 70.55652977, 76.03698243, 78.67745801, 66.6236828 ,
       75.05325264, 68.48521383, 63.75188145, 79.04145354, 79.55598501,
       74.15366749, 65.40262791, 72.01818857, 71.71099737, 62.45412909,
       62.53576856, 75.34703738, 64.94548573, 71.32471152, 75.21787827,
       57.29367695, 73.6229669 , 80.69685968, 65.44180841, 75.30

In [50]:
X_test = pd.read_csv("X_test.csv")
df_ids = pd.DataFrame(X_test['id'])
df_predictions = df_ids.join(pd.DataFrame(y_pred, columns=['y']))
df_predictions.to_csv('model_results/predictions_ensemble.csv', index=False)

In [79]:
regressor.coef_

array([0.16953751, 0.00606922, 0.85821248])

# Maybe Random Forest would be better?

In [65]:
from sklearn.ensemble import RandomForestRegressor

In [66]:
model = RandomForestRegressor(n_estimators=1000, max_depth=7, random_state=0)
model.fit(X, y)

RandomForestRegressor(max_depth=7, n_estimators=1000, random_state=0)

In [72]:
y_pred = model.predict(test_data)

In [73]:
y_pred

array([70.01487517, 64.90593606, 71.78472591, 68.0162201 , 71.34113306,
       76.93840103, 61.98866976, 60.63121891, 81.24762282, 81.99658333,
       61.51816421, 83.        , 73.19323457, 81.0353857 , 56.04452647,
       81.0186357 , 71.99979754, 74.84760717, 70.98902839, 67.17989854,
       66.99236191, 67.41771142, 76.96620624, 77.07891234, 61.52502135,
       70.57125748, 56.06030683, 70.6942434 , 66.24356647, 77.07205101,
       64.50144567, 71.64430041, 58.8896809 , 73.96754414, 67.02126588,
       63.47611108, 77.14419917, 62.85564395, 71.37706071, 71.62997793,
       71.86888645, 77.02728659, 71.13175214, 77.59154082, 80.01460522,
       68.29812191, 71.5299032 , 75.76668652, 78.65471403, 68.24566822,
       76.18532288, 68.34353967, 63.45057868, 77.54715648, 80.43870532,
       74.62346759, 66.51466659, 71.57042468, 70.87217532, 62.65326596,
       62.8163825 , 75.68126585, 64.63311399, 71.28489674, 76.19025531,
       57.29182455, 74.25925297, 80.11697919, 64.82254793, 74.93

In [74]:
X_test = pd.read_csv("X_test.csv")
df_ids = pd.DataFrame(X_test['id'])
df_predictions = df_ids.join(pd.DataFrame(y_pred, columns=['y']))
df_predictions.to_csv('model_results/predictions_ensemble_rf.csv', index=False)

In [75]:
model.predict(X)

array([71.03409074, 73.18445985, 65.34666989, ..., 81.99658333,
       76.04490841, 80.99942072])

In [76]:
y

0       71.0
1       73.0
2       66.0
3       55.0
4       67.0
        ... 
1204    67.0
1205    48.0
1206    82.0
1207    76.0
1208    81.0
Name: y, Length: 1209, dtype: float64