In [1]:
#importing essential libraries

import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

In [2]:
#loading the training data into a dataframe

data = pd.read_csv("https://raw.githubusercontent.com/vrindamathur1428/DataSets/master/TimeForecasting_train.csv")
data.head()

Unnamed: 0,id,time,feature_1,feature_2
0,0,2019-03-19 00:00:00,735.740043,54479.540513
1,1,2019-03-19 00:00:10,734.102947,47888.033714
2,2,2019-03-19 00:00:20,730.060336,47700.882325
3,3,2019-03-19 00:00:30,725.609742,47790.094648
4,4,2019-03-19 00:00:40,724.32848,47808.402381


In [3]:
#here we can see the data types of all columns
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 564 entries, 0 to 563
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         564 non-null    int64  
 1   time       564 non-null    object 
 2   feature_1  564 non-null    float64
 3   feature_2  564 non-null    float64
dtypes: float64(2), int64(1), object(1)
memory usage: 17.8+ KB


In [4]:
#modifying the data such that the datatype of "time" column is changed from string object to date-time object
#this will help in making calculations easier

dateparse = lambda dates: pd.datetime.strptime(dates, "%Y-%m-%d %H:%M:%S")
data = pd.read_csv("https://raw.githubusercontent.com/vrindamathur1428/DataSets/master/TimeForecasting_train.csv", parse_dates=['time'], date_parser=dateparse)

  after removing the cwd from sys.path.


In [5]:
#converting the date-time object into float object
#because random forest regressor takes only float variables as input(and not categorical)

def datetime_to_float(d):
    return d.timestamp()

for i in range(len(data["time"])):
    data.loc[i, "time"]=datetime_to_float(data.loc[i, "time"])

In [6]:
#this is what modified data looks like

data.head()

Unnamed: 0,id,time,feature_1,feature_2
0,0,1552950000.0,735.740043,54479.540513
1,1,1552950000.0,734.102947,47888.033714
2,2,1552950000.0,730.060336,47700.882325
3,3,1552950000.0,725.609742,47790.094648
4,4,1552950000.0,724.32848,47808.402381


In [7]:
#here we can see that the data type of "time" column is changed

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 564 entries, 0 to 563
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         564 non-null    int64  
 1   time       564 non-null    object 
 2   feature_1  564 non-null    float64
 3   feature_2  564 non-null    float64
dtypes: float64(2), int64(1), object(1)
memory usage: 17.8+ KB


In [8]:
#separating the data into features(variables) and labels(targets)

x=data.loc[:, ["time", "feature_1"]] #featuers
y=data["feature_2"] #labels

In [9]:
#Standardizing the training data: Scale the data to be between -1 and 1
scaler = StandardScaler()
scaler.fit(x)
x= scaler.transform(x)

In [10]:
#creating an object of random forest regression model and training it on training data
regressor = RandomForestRegressor(n_estimators = 50, max_depth=20, random_state = 0) 
X=np.array(x)
Y=np.array(y)
regressor.fit(X, Y)

RandomForestRegressor(max_depth=20, n_estimators=50, random_state=0)

In [11]:
#loading test data into a new dataframe

testData=pd.read_csv("https://raw.githubusercontent.com/vrindamathur1428/DataSets/master/TimeForecasting_test.csv")
testData.head()

Unnamed: 0,id,time,feature_1
0,564,2019-03-19 01:34:00,423.064004
1,565,2019-03-19 01:34:10,423.342749
2,566,2019-03-19 01:34:20,423.181186
3,567,2019-03-19 01:34:30,421.275243
4,568,2019-03-19 01:34:40,422.208444


In [12]:
#modifying the test data like we did the training data

dateparse = lambda dates: pd.datetime.strptime(dates, "%Y-%m-%d %H:%M:%S")
testData = pd.read_csv("https://raw.githubusercontent.com/vrindamathur1428/DataSets/master/TimeForecasting_test.csv", parse_dates=['time'], date_parser=dateparse)

  This is separate from the ipykernel package so we can avoid doing imports until


In [13]:
for i in range(len(testData["time"])):
    testData.loc[i, "time"]=datetime_to_float(testData.loc[i, "time"])

In [14]:
testData.head()

Unnamed: 0,id,time,feature_1
0,564,1552960000.0,423.064004
1,565,1552960000.0,423.342749
2,566,1552960000.0,423.181186
3,567,1552960000.0,421.275243
4,568,1552960000.0,422.208444


In [15]:
#storing the feature vectors in "x_test" variable
x_test=testData.loc[:, ["time", "feature_1"]]

#variable "id" will store the corresponding ids to assign to the predicted output
ids=testData["id"]

In [16]:
#the same scaling and shifting operations will be applied to the test data to be consistent with the transformation performed on the train data:
x_test= scaler.transform(x_test)


In [17]:
#this is what scaled test data looks like
x_test

array([[ 1.73512455, -1.42935774],
       [ 1.74126658, -1.42540081],
       [ 1.74740861, -1.42769428],
       [ 1.75355065, -1.4547501 ],
       [ 1.75969268, -1.44150284],
       [ 1.76583472, -1.43571377],
       [ 1.77197675, -1.39009635],
       [ 1.77811878, -1.38421851],
       [ 1.78426082, -1.38707477],
       [ 1.79040285, -1.40566386],
       [ 1.79654488, -1.4059969 ],
       [ 1.80268692, -1.54630962],
       [ 1.80882895, -1.41759013],
       [ 1.81497099, -1.4293247 ],
       [ 1.82111302, -1.43289807],
       [ 1.82725505, -1.43571004],
       [ 1.83339709, -1.43992626],
       [ 1.83953912, -1.44644915],
       [ 1.84568116, -1.45513535],
       [ 1.85182319, -1.67915983],
       [ 1.85796522, -1.47647258],
       [ 1.86410726, -1.46230474],
       [ 1.87024929, -1.46568713],
       [ 1.87639132, -1.48022847],
       [ 1.88253336, -1.48502637],
       [ 1.88867539, -1.49094817],
       [ 1.89481743, -1.49813917],
       [ 1.90095946, -1.50314876],
       [ 1.90710149,

In [18]:
#predicting the "feature_2" value using the Random forest regression model created above

X_test=np.array(x_test)
y_pred = regressor.predict(X_test)

In [19]:
#predicted values

y_pred

array([53930.97907935, 53930.97907935, 53930.97907935, 53936.36678153,
       53936.36678153, 53931.1228025 , 53952.44998984, 53946.96792919,
       53946.96792919, 53954.10705564, 53954.10705564, 53940.28063127,
       53951.86745693, 53930.97907935, 53931.1228025 , 53931.1228025 ,
       53931.1228025 , 53936.36678153, 53936.36678153, 53940.28063127,
       53940.28063127, 53936.36678153, 53940.28063127, 53940.28063127,
       53940.28063127, 53940.28063127, 53940.28063127, 53940.28063127,
       53940.28063127, 53940.28063127, 53940.28063127, 53940.28063127,
       53940.28063127, 53940.28063127, 53940.28063127, 53940.28063127,
       53940.28063127, 53940.28063127, 53940.28063127, 53940.28063127,
       53940.28063127, 53940.28063127, 53940.28063127, 53940.28063127,
       53940.28063127, 53940.28063127, 53940.28063127, 53940.28063127,
       53940.28063127, 53940.28063127, 53940.28063127, 53940.28063127,
       53940.28063127, 53940.28063127, 53940.28063127, 53940.28063127,
      

In [20]:
len(y_pred)

375

In [21]:
testresult=pd.DataFrame({"id": ids, "feature_2":y_pred})
testresult

Unnamed: 0,id,feature_2
0,564,53930.979079
1,565,53930.979079
2,566,53930.979079
3,567,53936.366782
4,568,53936.366782
...,...,...
370,934,53976.954458
371,935,53976.954149
372,936,53833.371931
373,937,53844.371357


In [22]:
testresult.to_csv("solution.csv", index=False) #stored in the same directory as the python notebook