In [80]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE

##### Read data (drop columns with 1 unique value)

In [81]:
# Load the tab-separated CLRTAP extract and discard the columns that the
# section heading describes as carrying a single unique value.
SINGLE_VALUE_COLUMNS = ['Country_Code', 'Notation', 'VersionId', 'Format_name']
df = (
    pd.read_csv("../datacleanning/clean_clrtap.csv", sep='\t')
    .drop(columns=SINGLE_VALUE_COLUMNS)
)
df.columns

Index(['Country', 'Pollutant_Name', 'Sector_code', 'Year', 'Emissions',
       'Parent_sector_code', 'Sector_Label', 'Sector_Name'],
      dtype='object')

##### Fill NaN

In [82]:
# Replace missing values with the sentinel 'NONE' in text (object-dtype)
# columns only. A blanket df.fillna('NONE') would also write the string into
# any numeric column that contains NaN, silently turning it into an object
# column; numeric gaps are left as NaN so they stay visible.
text_columns = df.select_dtypes(include='object').columns
df = df.fillna({column: 'NONE' for column in text_columns})

##### Target + Variables

In [83]:
# Separate the regression target from the predictor columns.
TARGET = 'Emissions'
y = df[TARGET]
X = df.drop(columns=[TARGET])
X.head()

Unnamed: 0,Country,Pollutant_Name,Sector_code,Year,Parent_sector_code,Sector_Label,Sector_Name
0,Finland,As,NATIONAL TOTAL,1990,NONE,National total for the entire territory (based...,National total for the entire territory
1,Finland,As,1A3di(i),1990,NONE,International maritime navigation,Other activities
2,France,As,1A1a,1990,NATIONAL TOTAL,Public electricity and heat production,Energy sector
3,France,As,1A1b,1990,NATIONAL TOTAL,Petroleum refining,Energy sector
4,France,As,1A1c,1990,NATIONAL TOTAL,Manufacture of solid fuels and other energy in...,Energy sector


##### Encoding

In [84]:
# Collect the text (object-dtype) predictors that need a numeric encoding.
is_text = X.dtypes == 'object'
categorical_columns = list(is_text[is_text].index)
print(categorical_columns)

# Encode only when text columns are present. After the first run X is fully
# numeric, so the selection is empty; without the guard a re-run would replace
# the fitted encoder with a fresh one fitted on no columns, losing the
# category mappings.
# NOTE(review): the encoder is fitted on all rows, before the train/test
# split performed further down — confirm this is acceptable.
if categorical_columns:
    ordinal_encoder = OrdinalEncoder()
    X[categorical_columns] = ordinal_encoder.fit_transform(X[categorical_columns])
X.head()

['Country', 'Pollutant_Name', 'Sector_code', 'Parent_sector_code', 'Sector_Label', 'Sector_Name']


Unnamed: 0,Country,Pollutant_Name,Sector_code,Year,Parent_sector_code,Sector_Label,Sector_Name
0,10.0,0.0,135.0,1990,1.0,75.0,4.0
1,10.0,0.0,26.0,1990,1.0,52.0,6.0
2,11.0,0.0,3.0,1990,0.0,102.0,2.0
3,11.0,0.0,4.0,1990,0.0,98.0,2.0
4,11.0,0.0,5.0,1990,0.0,57.0,2.0


##### Standardize

In [25]:
# Standardisation is switched off: the code below sits inside a string
# literal, so evaluating this cell fits no scaler and defines no X_scaled.
# NOTE(review): the warnings and array stored under this cell appear to come
# from an earlier run (execution count 25, out of sequence with the
# surrounding cells) — confirm and clear the stale output.
"""scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print(X_scaled)"""

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction ** 2 / new_sample_count


[[-0.62942342 -1.7731201   0.         ...         nan  0.02390107
   0.20614074]
 [-0.62942342 -1.7731201   0.         ...         nan -0.55348515
   1.00579523]
 [-0.53112721 -1.7731201   0.         ...  0.          0.70170228
  -0.59351376]
 ...
 [ 1.72968551  1.48330341  0.         ...  0.          0.95273977
   1.40562247]
 [ 1.72968551  1.48330341  0.         ...  0.          1.00294727
   1.40562247]
 [ 1.72968551  1.48330341  0.         ...  0.          0.90253227
   1.40562247]]


##### Split Data

In [85]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the
# partition repeatable between runs.
SEED = 1
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=SEED,
)

##### Random Forest Model Prediction

In [89]:
# Disabled hyper-parameter search: the whole cell is a single string literal,
# so no grid search is executed when it is evaluated. The output stored under
# this cell is a KeyboardInterrupt.
# If re-enabled, the search would try n_estimators in {200, 400, 600} using
# GridSearchCV's default cross-validation and scoring settings.
# NOTE(review): the following cell hard-codes n_estimators=400; presumably
# that value was chosen by hand — confirm.
"""rf = RandomForestRegressor(min_samples_leaf=0.12,random_state=SEED)

parameters = {"n_estimators":range(200,800,200)}
cv = GridSearchCV(rf, param_grid=parameters)

cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)
print("Best estimator: {}".format(cv.best_params_))"""

KeyboardInterrupt: 

In [92]:
# Train the forest on the training split, then predict the held-out rows.
# NOTE(review): per the scikit-learn docs a float min_samples_leaf is read as
# a fraction of the training samples, so 0.12 requires every leaf to hold
# roughly 12% of the rows — confirm that such coarse trees are intended.
rf = RandomForestRegressor(
    n_estimators=400,
    min_samples_leaf=0.12,
    random_state=SEED,
)
y_pred = rf.fit(X_train, y_train).predict(X_test)

##### RMSE and MAE

In [94]:
# Score the predictions on the held-out split: root-mean-squared error
# (rounded to two decimals when printed) and mean absolute error.
mae_test = MAE(y_test, y_pred)
rmse_test = np.sqrt(MSE(y_test, y_pred))
print('RMSE : {:.2f}'.format(rmse_test))
print('MAE : ', mae_test)

RMSE : 279.21
MAE :  24.542930680491388
