In [42]:
# ! pip install scikit-learn 
# ! pip install numpy
# ! pip install pandas

In [43]:
import csv
import math
import os

import numpy as np
import pandas as pd

from sklearn import set_config
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, scale
from sklearn.tree import export_graphviz

In [1]:
def preprocessData(singleData):
    """Adjust selected columns of one record in place and return that record.

    The column meanings are not visible in this function; going by the
    column-order comment elsewhere in this notebook, indices 3-6 are
    presumably hour, minute, latitude and longitude -- TODO confirm against
    the CSV schema.

    Steps, in order:
      1. subtract 24.5 from column 5 and 121 from column 6;
      2. replace a 0 in column 3 with 24, and a 0 in column 4 with 60;
      3. divide column 4 by 60.

    Args:
        singleData: mutable, index-addressable record with at least seven
            numeric entries (the callers in this notebook pass a list).

    Returns:
        The same ``singleData`` object, modified in place.
    """
    # Fixed offsets subtracted from columns 5 and 6.
    for column, offset in ((5, 24.5), (6, 121)):
        singleData[column] -= offset

    # A zero in column 3 becomes 24; a zero in column 4 becomes 60.
    for column, substitute in ((3, 24), (4, 60)):
        if singleData[column] == 0:
            singleData[column] = substitute

    # Column 4 is rescaled only after the zero replacement above.
    singleData[4] /= 60
    return singleData

def _loadCsvRows(dirpath, reverse=False, skip=(), verbose=False):
    """Read every file in ``dirpath`` as CSV and return its preprocessed rows.

    Args:
        dirpath: directory to scan; a trailing path separator is optional.
        reverse: sort the filenames in descending instead of ascending order.
        skip: filenames that must not be read.
        verbose: print each filename right before it is read.

    Returns:
        A flat list with one ``preprocessData``-processed row (a list) per CSV
        record, in file order.
    """
    filenames = sorted(
        (name for name in os.listdir(dirpath) if name != '.DS_Store'),
        reverse=reverse,
    )
    rows = []
    for filename in filenames:
        if filename in skip:
            continue
        if verbose:
            print(filename)
        # os.path.join works whether or not dirpath ends with a separator;
        # plain string concatenation silently built a wrong path without one.
        table = pd.read_csv(os.path.join(dirpath, filename))
        rows.extend(preprocessData(row) for row in table.values.tolist())
    return rows


def readDataset(data_filepath, inference_filepath):
    """Load and preprocess all training and inference CSV rows.

    Training files are read in ascending filename order. Inference files are
    read in descending filename order, each filename is printed, and
    ``inf_1211_to_1217.csv`` is skipped. ``.DS_Store`` entries are ignored in
    both directories.

    Args:
        data_filepath: directory holding the training CSV files.
        inference_filepath: directory holding the inference CSV files.

    Returns:
        ``(dataset, testset)``: two lists of preprocessed rows.

    Raises:
        AssertionError: if either directory does not exist.
    """
    # Check both directories up front so a wrong inference path is reported
    # before the training files are loaded rather than after.
    assert os.path.exists(data_filepath), \
        "training data directory not found: {!r}".format(data_filepath)
    assert os.path.exists(inference_filepath), \
        "inference directory not found: {!r}".format(inference_filepath)

    dataset = _loadCsvRows(data_filepath)
    testset = _loadCsvRows(
        inference_filepath,
        reverse=True,
        skip=("inf_1211_to_1217.csv",),
        verbose=True,
    )
    return dataset, testset

In [None]:
# Input directories and output file used by the cells below. The commented-out
# entries are alternative Kaggle-style input locations.
config = dict(
    data_filepath='dataset_w_csv/',
    # data_filepath='/kaggle/input/dataset-1201-new/dataset_w_csv/',
    inference_filepath='inference_w_csv/',
    # inference_filepath='/kaggle/input/inference-1204/inference_csv/',
    # inference_filepath='/kaggle/input/inference-new-1204/inference_w_csv/',
    outputFilename="results/prediction_RandomForest.csv",
)

In [None]:
def splitFeaturesLabels(dataset, type):
    """Split a 2-D record table into model inputs and the remaining columns.

    Columns 0-7 are always returned as a float feature matrix. What else is
    returned depends on ``type``:

    * ``"train"``: column 8 as a float label vector.
    * ``"test"``: column 9 of every row (``titles``) and column 8 of every
      row truncated to ``int`` (``tots``).

    Args:
        dataset: 2-D ``numpy.ndarray`` (or nested sequence) of records.
        type: ``"train"`` or ``"test"``.

    Returns:
        ``(features, labels)`` for ``"train"``;
        ``(features, titles, tots)`` for ``"test"``.

    Raises:
        NotImplementedError: if ``type`` is neither ``"train"`` nor ``"test"``.
    """
    # Legacy column list kept from the original code:
    # month, day, weekday, hr, min, lat, lng, dist_g, dist_k, act, ratio, sbi, tot, title, act_title
    # NOTE(review): it names 15 columns and does not line up with the indices
    # used below (0-7 features, 8 label/tot, 9 title) -- confirm against the
    # current CSV schema.
    if type not in ("train", "test"):
        raise NotImplementedError(
            "unsupported split type {!r}; expected 'train' or 'test'".format(type)
        )

    dataset = np.asarray(dataset)
    features = np.array(dataset[:, :8], dtype=float)

    if type == "train":
        labels = np.array(dataset[:, 8], dtype=float)
        return features, labels

    # A table mixing numbers and text becomes a string array once wrapped in
    # np.array, so a total can arrive as text such as '30.0'. int() rejects
    # that form, hence the detour through float().
    tots = [int(float(value)) for value in dataset[:, 8]]
    titles = list(dataset[:, 9])
    return features, titles, tots

In [None]:
# Load the preprocessed rows, then split them into model inputs and the
# extra columns needed later (labels for training; titles and tots for output).
dataset, testset = readDataset(config['data_filepath'], config['inference_filepath'])
xtrain, ytrain = splitFeaturesLabels(np.array(dataset), "train")
xtest, titles, tots = splitFeaturesLabels(np.array(testset), "test")

# Standardise BOTH sets with the mean/std learned from the training features.
# Scaling each set with its own statistics would map the same raw value to
# different numbers in train and test, so the fitted model would see shifted
# inputs at prediction time.
scaler = StandardScaler().fit(xtrain)
xtrain = scaler.transform(xtrain)
xtest = scaler.transform(xtest)

In [None]:
# Sanity check: display the first row of the scaled inference features.
xtest[0]

In [None]:
# Row counts, one per line, in the order: xtrain, ytrain, xtest, titles, tots.
# The first two should match each other, and so should the last three.
print(*(len(seq) for seq in (xtrain, ytrain, xtest, titles, tots)), sep="\n")

In [None]:
# Model hyper-parameters, named so they are easy to find and tune.
N_ESTIMATORS = 1300
RANDOM_SEED = 42

# random_state fixes the forest's internal randomness so that re-running the
# notebook on the same data reproduces the same model and predictions.
# n_jobs=-1 trains on all available cores; verbose=1 logs progress.
rfr = RandomForestRegressor(
    n_estimators=N_ESTIMATORS,
    random_state=RANDOM_SEED,
    verbose=1,
    n_jobs=-1,
)
print(rfr)

In [None]:
rfr.fit(xtrain, ytrain)

# This R^2 is computed on the very rows the forest was fitted on, so it is an
# optimistic in-sample figure, not an estimate of accuracy on unseen data.
# The label says so explicitly to avoid misreading it as a validation score.
score = rfr.score(xtrain, ytrain)
print("R-squared (training set):", score)
 

In [None]:
ypreds = rfr.predict(xtest)

# Every prediction needs exactly one title and one tot to build its output row.
assert len(ypreds) == len(titles) == len(tots)

# Header row followed by one [title, prediction * tot] row per inference record.
# NOTE(review): multiplying by tot suggests the model predicts a ratio of tot --
# confirm against how the training label is built.
prediction = [['id', 'sbi']]
prediction.extend(
    [title, pred * tot] for pred, tot, title in zip(ypreds, tots, titles)
)

# Create the output directory if it is missing, so a finished training run is
# not lost to a FileNotFoundError at the very last step.
output_dir = os.path.dirname(config['outputFilename'])
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# newline='' lets the csv module control line endings itself.
with open(config['outputFilename'], 'w', newline='') as csv_file:
    csv.writer(csv_file).writerows(prediction)

print("output file written")
        
# Disabled: `ytest` is not defined in the visible cells (the test split returns no labels).
# mse = mean_squared_error(ytest, ypreds)
# print("MSE: ", mse)
# print("RMSE: ", mse ** 0.5)