In [2]:
# ! pip install scikit-learn 
# ! pip install numpy
# ! pip install pandas

In [3]:
import os
import pandas as pd
import numpy as np
import csv

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn import set_config 

In [4]:
def preprocessData(singleData):
    singleData[5] -= 24.5
    singleData[6] -= 121
    if singleData[3] == 0:
        singleData[3] = 24
    if singleData[4] == 0:
        singleData[4] = 60
    singleData[4] /= 60
      
    return singleData

def readDataset(data_filepath, inference_filepath):
    assert os.path.exists(data_filepath)
    filenames = os.listdir(data_filepath)
    if '.DS_Store' in filenames:
        filenames.remove('.DS_Store')
    filenames = sorted(filenames)
    dataset = []
    for idx, filename in enumerate(filenames):
        data = (pd.read_csv(data_filepath + filename).values).tolist()
        for single_data in data:
            single_data = preprocessData(single_data)
            dataset.append(single_data)

    testDataset = []
    assert os.path.exists(inference_filepath)
    filenames = os.listdir(inference_filepath)
    if '.DS_Store' in filenames:
        filenames.remove('.DS_Store')
    filenames = sorted(filenames, reverse=True)
    testset = []
    for idx, filename in enumerate(filenames):
        if filename == "inf_1211_to_1217.csv":
            continue
        print(filename)
        data = (pd.read_csv(inference_filepath + filename).values).tolist()
        for single_data in data:
            single_data = preprocessData(single_data)
            testset.append(single_data)
    return dataset, testset

In [30]:
config = {
    "data_filepath": 'dataset_w_with_distance/',
    # "data_filepath": '/kaggle/input/dataset-1201-new/dataset_w_csv/',
    "inference_filepath": 'inference_w_with_distance/',
    # "inference_filepath": '/kaggle/input/inference-1204/inference_csv/',
    # "inference_filepath": '/kaggle/input/inference-new-1204/inference_w_csv/',
    "outputFilename": "results/prediction_RandomForest.csv"
}

In [31]:
def splitFeaturesLabels(dataset, type):
    # month, day, weekday, hr, min, lat, lng, dist_g, dist_k, act, ratio, sbi, tot, title, act_title
    if type == "train":
        features = np.array(dataset[:,:10], dtype=float)
        labels = np.array(dataset[:,10], dtype=float)
        return features, labels
    elif type == "test":
        features = np.array(dataset[:,:10], dtype=float)
        titles = []
        tots = []
        for single_data in dataset:
            tots.append(int(single_data[10]))
            titles.append(single_data[11])
        return features, titles, tots
    else:
        raise NotImplementedError

In [32]:
dataset, testset = readDataset(config['data_filepath'], config['inference_filepath'])
xtrain, ytrain = splitFeaturesLabels(np.array(dataset), "train")
xtest, titles, tots = splitFeaturesLabels(np.array(testset), "test")

xtrain = scale(xtrain)
xtest = scale(xtest)

inf_1204_to_1210.csv
inf_1021_to_1024.csv


In [33]:
xtest[0]

array([ 0.75592895, -1.12823739, -1.3540064 ,  1.66132477,  1.22474487,
        2.0071616 ,  0.84918385,  1.72662541, -2.46255408,  0.12176624])

In [34]:
print(len(xtrain))
print(len(ytrain))
print(len(xtest))
print(len(titles))
print(len(tots))

459984
459984
88704
88704
88704


In [38]:
rfr = RandomForestRegressor(n_estimators=1500, verbose=1, n_jobs=-1)
print(rfr)

RandomForestRegressor(n_estimators=1500, n_jobs=-1, verbose=1)


In [39]:
rfr.fit(xtrain, ytrain)

score = rfr.score(xtrain, ytrain)
print("R-squared:", score) 
 

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   52.4s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 1500 out of 1500 | elapsed:  9.9min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    2.6s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   12.2s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   29.3s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:   57.3s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:  1.5min


R-squared: 0.9804217014720336


[Parallel(n_jobs=8)]: Done 1500 out of 1500 | elapsed:  1.8min finished


In [40]:
ypreds = rfr.predict(xtest)

assert len(ypreds) == len(titles) == len(tots)
prediction = [['id','sbi']]
for (pred, tot, title) in zip(ypreds, tots, titles):
    prediction.append([title, pred*tot])

with open(config['outputFilename'], 'w', newline='') as file:
# Step 4: Using csv.writer to write the list to the CSV file
    writer = csv.writer(file)
    writer.writerows(prediction) # Use writerows for nested list
    
print("output file written")
        
# mse = mean_squared_error(ytest, ypred)
# print("MSE: ", mse)
# print("RMSE: ", mse*(1/2.0)) 

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.9s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    8.9s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   18.7s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:   33.6s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:   54.9s
[Parallel(n_jobs=8)]: Done 1500 out of 1500 | elapsed:  1.1min finished


output file written
