# Preprocess training data

In [1]:
import pandas as pd

# Read the training CSV from the current working directory into a DataFrame.
train_data = pd.read_csv(filepath_or_buffer="./train.csv")

In [2]:
# Discard every row containing at least one missing value, then renumber the
# remaining rows from 0 so the index is contiguous again.
# NOTE(review): this runs before the unused columns are dropped, so a row is
# also discarded when its only missing value is in a column that is removed
# later — confirm that this loss of training rows is intended.
train_data.dropna(axis=0, how="any", inplace=True)
train_data.reset_index(inplace=True, drop=True)

In [3]:
# Convert both date columns to pandas datetime values.
for date_col in ("purchase_date", "release_date"):
    train_data[date_col] = pd.to_datetime(train_data[date_col])

In [4]:
# Derive calendar features from the purchase timestamp.
purchase_dt = train_data["purchase_date"].dt
train_data["purchase_Year"] = purchase_dt.year
train_data["purchase_Month"] = purchase_dt.month

# Remove the identifier, the raw dates, and every column not used as a feature.
unused_train_columns = [
    "id",
    "tags",
    "purchase_date",
    "release_date",
    "genres",
    "categories",
    "is_free",
    "price",
]
train_data.drop(columns=unused_train_columns, inplace=True)

# Preview the resulting frame.
train_data.head()

Unnamed: 0,playtime_forever,total_positive_reviews,total_negative_reviews,purchase_Year,purchase_Month
0,0.0,372.0,96.0,2018,7
1,0.016667,23.0,0.0,2016,11
2,0.0,3018.0,663.0,2018,7
3,1.533333,63078.0,1746.0,2016,11
4,22.333333,8841.0,523.0,2018,3


In [5]:
# Store the review counts as integers instead of floats.
review_count_cols = ["total_positive_reviews", "total_negative_reviews"]
train_data[review_count_cols] = train_data[review_count_cols].astype("int")

# Preprocess test data

In [6]:
import pandas as pd

# Read the test CSV from the current working directory into a DataFrame.
test_data = pd.read_csv(filepath_or_buffer="./test.csv")

In [7]:
# Convert both date columns to pandas datetime values.
for date_col in ("purchase_date", "release_date"):
    test_data[date_col] = pd.to_datetime(test_data[date_col])

In [8]:
# Derive calendar features from the purchase timestamp.
purchase_dt = test_data["purchase_date"].dt
test_data["purchase_Year"] = purchase_dt.year
test_data["purchase_Month"] = purchase_dt.month

# Remove the raw dates and every column not used as a feature; "id" is kept.
unused_test_columns = [
    "tags",
    "purchase_date",
    "release_date",
    "genres",
    "categories",
    "is_free",
    "price",
]
test_data.drop(columns=unused_test_columns, inplace=True)

# Preview the resulting frame.
test_data.head()

Unnamed: 0,id,total_positive_reviews,total_negative_reviews,purchase_Year,purchase_Month
0,0,2607.0,1122.0,2018.0,10.0
1,1,5762.0,2235.0,2019.0,7.0
2,2,687.0,133.0,2019.0,7.0
3,3,67.0,39.0,2018.0,1.0
4,4,40344.0,3708.0,2017.0,2.0


In [9]:
# Impute missing test-set values: the column mean for the review counts and
# the column median for the purchase year/month.
#
# Each filled Series is assigned back to its column instead of calling
# `fillna(..., inplace=True)` on `test_data[col]`. The latter is chained
# assignment on an intermediate Series: it emits a FutureWarning in pandas 2.x
# and, with Copy-on-Write enabled, leaves `test_data` unmodified.
test_data["total_positive_reviews"] = test_data["total_positive_reviews"].fillna(test_data["total_positive_reviews"].mean())
test_data["total_negative_reviews"] = test_data["total_negative_reviews"].fillna(test_data["total_negative_reviews"].mean())
test_data["purchase_Year"] = test_data["purchase_Year"].fillna(test_data["purchase_Year"].median())
test_data["purchase_Month"] = test_data["purchase_Month"].fillna(test_data["purchase_Month"].median())

# Show, per column, whether any null values remain.
test_data.isnull().any()

id                        False
total_positive_reviews    False
total_negative_reviews    False
purchase_Year             False
purchase_Month            False
dtype: bool

In [10]:
# Cast the four feature columns to integers. Note that astype("int") truncates
# any fractional part, e.g. of a mean-imputed review count.
integer_feature_cols = [
    "total_positive_reviews",
    "total_negative_reviews",
    "purchase_Year",
    "purchase_Month",
]
test_data[integer_feature_cols] = test_data[integer_feature_cols].astype("int")

# Fit model and generate predictions

In [11]:
# Separate the predictor columns from the regression target.
train_data_X = train_data.drop(columns="playtime_forever")
train_data_Y = train_data["playtime_forever"]

In [12]:
# Feature matrix for prediction: everything in test_data except the identifier.
test_data_X = test_data.drop(labels=["id"], axis="columns")

In [13]:
from math import sqrt
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb

# Train a LightGBM regressor on the training features to predict playtime.
model = lgb.LGBMRegressor(objective='regression', n_estimators=20)
model.fit(train_data_X, train_data_Y)

# Select the test features by the training column names so the model always
# receives exactly the columns it was fitted on, in the same order. The train
# and test frames are built by separate preprocessing cells, so their layouts
# otherwise agree only by coincidence. This also means that if `test_data_X`
# is rebuilt after `playtime_forever` has been added to `test_data` below, the
# extra column is ignored; a missing feature raises a KeyError instead of
# being silently mispredicted.
test = model.predict(test_data_X[train_data_X.columns])

# Attach the predictions and keep only the id and prediction columns.
test_data["playtime_forever"] = test
predict_data = test_data[["id", "playtime_forever"]]
predict_data.head()

Unnamed: 0,id,playtime_forever
0,0,1.06376
1,1,1.282527
2,2,0.650554
3,3,1.313096
4,4,10.157902


In [14]:
# Write the id/prediction table to CSV, omitting the DataFrame index.
predict_data.to_csv(path_or_buf="./final_2.csv", index=False)