In [50]:
import numpy as np
import pandas as pd

# Show every column when a DataFrame is rendered (no truncation with "...").
pd.options.display.max_columns = None


In [51]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree  import DecisionTreeRegressor

In [52]:
import sklearn
print(sklearn.__version__)

1.3.0


In [53]:
data = pd.read_csv('../data/Processed_DatasetsAmount-of Waste-Generated-By-State 32121-0003.csv')

In [54]:
data.head()

Unnamed: 0,Year,States,Types of Waste,Total Household Waste Generated (Tons),Household Waste Generated per Inhabitant (kg)
0,2004,Baden-Württemberg,Residual household and bulky wastes,1605.6,150.0
1,2004,Baden-Württemberg,Separately collected organic wastes,1220.5,114.0
2,2004,Baden-Württemberg,Separately collected recyclables,1645.3,154.0
3,2004,Baden-Württemberg,Other wastes,9.4,1.0
4,2004,Bayern,Separately collected organic wastes,1677.3,135.0


In [55]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1046 entries, 0 to 1045
Data columns (total 5 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Year                                           1046 non-null   int64  
 1   States                                         1046 non-null   object 
 2   Types of Waste                                 1046 non-null   object 
 3   Total Household Waste Generated (Tons)         1046 non-null   float64
 4   Household Waste Generated per Inhabitant (kg)  1046 non-null   float64
dtypes: float64(2), int64(1), object(2)
memory usage: 41.0+ KB


In [56]:
data.isnull().sum()

Year                                             0
States                                           0
Types of Waste                                   0
Total Household Waste Generated (Tons)           0
Household Waste Generated per Inhabitant (kg)    0
dtype: int64

In [57]:
data.describe()

Unnamed: 0,Year,Total Household Waste Generated (Tons),Household Waste Generated per Inhabitant (kg)
count,1046.0,1046.0,1046.0
mean,2012.451243,405.893881,110.816904
std,5.18103,462.755575,80.167121
min,2004.0,0.1,1.0
25%,2008.0,29.4,21.0
50%,2012.0,230.3,125.0
75%,2017.0,613.75,163.0
max,2021.0,1799.1,349.0


In [58]:
# Correlate the numeric columns only. 'States' and 'Types of Waste' are string
# columns, and pandas >= 2.0 raises on them unless numeric_only=True is passed.
data.corr(numeric_only=True)

Unnamed: 0,Year,Total Household Waste Generated (Tons),Household Waste Generated per Inhabitant (kg)
Year,1.0,-0.020062,0.019148
Total Household Waste Generated (Tons),-0.020062,1.0,0.575053
Household Waste Generated per Inhabitant (kg),0.019148,0.575053,1.0


In [59]:
data['Year'].unique()

array([2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014,
       2015, 2016, 2017, 2018, 2019, 2020, 2021])

In [60]:
data['States'].unique()

array(['Baden-Württemberg', 'Bayern', 'Berlin', 'Brandenburg', 'Bremen',
       'Hamburg', 'Hessen', 'Mecklenburg-Vorpommern', 'Niedersachsen',
       'Nordrhein-Westfalen', 'Rheinland-Pfalz', 'Saarland', 'Sachsen',
       'Sachsen-Anhalt', 'Schleswig-Holstein', 'Thüringen'], dtype=object)

In [61]:
data['Types of Waste'].unique()

array(['Residual household and bulky wastes',
       'Separately collected organic wastes',
       'Separately collected recyclables', 'Other wastes'], dtype=object)

In [62]:
data['Total Household Waste Generated (Tons)'].unique()

array([1.6056e+03, 1.2205e+03, 1.6453e+03, 9.4000e+00, 1.6773e+03,
       5.9100e+01, 9.7500e+02, 9.9200e+01, 3.9020e+02, 1.5000e+00,
       5.5820e+02, 7.2100e+01, 3.4540e+02, 6.6000e+00, 1.6700e+02,
       5.7000e+01, 9.5000e+01, 2.0000e-01, 3.7300e+01, 1.7020e+02,
       1.9000e+00, 1.2787e+03, 7.2780e+02, 8.1010e+02, 3.3000e+00,
       4.0710e+02, 6.9700e+01, 2.5360e+02, 1.4000e+00, 1.6581e+03,
       1.1052e+03, 1.1605e+03, 1.0100e+01, 9.8100e+01, 8.0410e+02,
       5.3250e+02, 6.4320e+02, 2.1500e+01, 2.6880e+02, 1.6030e+02,
       1.2640e+02, 5.0000e-01, 6.9600e+02, 2.0330e+02, 5.8870e+02,
       2.6000e+00, 6.2530e+02, 1.8330e+02, 3.3840e+02, 2.0800e+01,
       6.8850e+02, 2.8050e+02, 4.1370e+02, 4.9750e+02, 1.4380e+02,
       3.3630e+02, 1.4700e+01, 1.5683e+03, 1.2393e+03, 1.7027e+03,
       8.8000e+00, 1.5971e+03, 5.4100e+01, 9.7890e+02, 1.1380e+02,
       4.0670e+02, 5.6250e+02, 7.2500e+01, 3.5270e+02, 4.1000e+00,
       1.6410e+02, 5.8000e+01, 8.8100e+01, 3.0000e-01, 3.6200e

In [63]:
data['Household Waste Generated per Inhabitant (kg)'].unique()

array([150.        , 114.        , 154.        ,   1.        ,
       135.        ,   5.        , 288.        ,  29.        ,
       115.        , 115.51604278, 217.        ,  28.        ,
         3.        , 252.        ,  86.        , 143.        ,
        21.        ,  98.        , 210.        , 119.        ,
       133.        , 237.        ,  41.        , 147.        ,
       207.        , 138.        , 145.        , 198.        ,
       131.        , 158.        , 254.        , 152.        ,
       120.        , 162.        ,  47.        , 137.        ,
       251.        ,  73.        , 136.        ,   8.        ,
       243.        ,  99.        , 146.        , 211.        ,
        61.        ,   6.        , 159.        , 128.        ,
         4.        ,  34.        , 220.        ,   2.        ,
       247.        ,  88.        ,  94.        , 208.        ,
       116.        , 235.        ,  42.        , 144.        ,
       199.        , 140.        , 129.        , 161.  

In [64]:
data.head()

Unnamed: 0,Year,States,Types of Waste,Total Household Waste Generated (Tons),Household Waste Generated per Inhabitant (kg)
0,2004,Baden-Württemberg,Residual household and bulky wastes,1605.6,150.0
1,2004,Baden-Württemberg,Separately collected organic wastes,1220.5,114.0
2,2004,Baden-Württemberg,Separately collected recyclables,1645.3,154.0
3,2004,Baden-Württemberg,Other wastes,9.4,1.0
4,2004,Bayern,Separately collected organic wastes,1677.3,135.0


In [65]:
y = data['Household Waste Generated per Inhabitant (kg)']

In [66]:
y.head()

0    150.0
1    114.0
2    154.0
3      1.0
4    135.0
Name: Household Waste Generated per Inhabitant (kg), dtype: float64

In [67]:
data_X = data

In [68]:
# Build the feature frame as a NEW object. `data_X` was only an alias of `data`,
# so an in-place drop would also strip the target columns from `data` and make
# this cell fail with a KeyError when it is run a second time.
data_X = data.drop(
    columns=[
        'Household Waste Generated per Inhabitant (kg)',
        'Total Household Waste Generated (Tons)',
    ]
)

In [69]:
data_X

Unnamed: 0,Year,States,Types of Waste
0,2004,Baden-Württemberg,Residual household and bulky wastes
1,2004,Baden-Württemberg,Separately collected organic wastes
2,2004,Baden-Württemberg,Separately collected recyclables
3,2004,Baden-Württemberg,Other wastes
4,2004,Bayern,Separately collected organic wastes
...,...,...,...
1041,2021,Schleswig-Holstein,Other wastes
1042,2021,Thüringen,Residual household and bulky wastes
1043,2021,Thüringen,Separately collected organic wastes
1044,2021,Thüringen,Separately collected recyclables


In [70]:
# Step 1: hold out 20% of the rows for the final evaluation (fixed seed so the
# split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(
    data_X,
    y,
    random_state=42,
    test_size=0.2,
)

In [71]:
X_train.head()

Unnamed: 0,Year,States,Types of Waste
256,2008,Hessen,Residual household and bulky wastes
351,2009,Thüringen,Other wastes
60,2005,Baden-Württemberg,Separately collected recyclables
344,2009,Schleswig-Holstein,Residual household and bulky wastes
381,2010,Mecklenburg-Vorpommern,Other wastes


In [72]:
y_train.sample(5)

537      1.0
560    164.0
650    247.0
619      2.0
584     86.0
Name: Household Waste Generated per Inhabitant (kg), dtype: float64

In [73]:
# Column groups that are routed to the numeric / categorical transformers.
categorical_features = ['States', 'Types of Waste']
numeric_features = ['Year']

In [74]:
numeric_transformer = Pipeline(steps=[('scaler', MinMaxScaler(feature_range=(-1, 1)))])

In [75]:
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(sparse_output=True, handle_unknown='ignore'))])

In [76]:
# Apply the numeric pipeline to the numeric columns and the categorical
# pipeline to the categorical columns, then concatenate the results.
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [77]:
# train the model
random_forest = RandomForestRegressor(random_state=42)
#DecisionTreeRegressor(random_state=42)

In [78]:
from sklearn import set_config
set_config(display='diagram')

In [79]:
# End-to-end estimator: the ColumnTransformer preprocessing followed by the
# RandomForestRegressor. The step names are the prefixes used to address
# hyper-parameters later (e.g. "random_forest__max_depth" in the grid search).
# NOTE(review): 'precprocessor' looks like a typo for 'preprocessor'; it is a
# runtime key (see model.named_steps), so renaming it changes that mapping.
model = Pipeline(steps=[
    ('precprocessor', preprocessor),
   # ('feature_selection',feature_selection),  # disabled: `feature_selection` is not defined in the visible cells
    ('random_forest', random_forest)
])

In [80]:
# train
model.fit(X_train,y_train)

In [81]:
model.named_steps

{'precprocessor': ColumnTransformer(transformers=[('num',
                                  Pipeline(steps=[('scaler',
                                                   MinMaxScaler(feature_range=(-1,
                                                                               1)))]),
                                  ['Year']),
                                 ('cat',
                                  Pipeline(steps=[('onehot',
                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                  ['States', 'Types of Waste'])]),
 'random_forest': RandomForestRegressor(random_state=42)}

In [82]:
X_test.head()

Unnamed: 0,Year,States,Types of Waste
773,2017,Bremen,Separately collected organic wastes
280,2008,Sachsen,Other wastes
629,2014,Sachsen,Residual household and bulky wastes
318,2009,Hessen,Other wastes
1008,2021,Hamburg,Other wastes


In [83]:
final_predictions = model.predict(X_test)

In [84]:
final_predictions

array([ 86.99      ,   1.12      , 149.78      ,  73.14510695,
         2.72      , 133.69      , 137.88      ,  68.15      ,
       205.78      ,   1.09      , 130.23      ,  96.73      ,
       110.3       , 122.81      ,  48.95      , 213.65      ,
       184.15      , 242.09      ,   1.        ,   6.06      ,
       179.55      , 152.06      ,   5.47      , 216.46      ,
        12.45160428, 152.72      , 149.48      , 118.47      ,
        77.07      , 136.14      ,   1.19      , 145.02      ,
        12.47160428, 139.51      , 232.18      , 234.88      ,
       248.39      , 232.11      , 166.3       ,  67.85      ,
         2.1       ,  47.92      , 203.27      , 120.21      ,
       136.45      , 133.23      ,   3.96      , 115.12      ,
       187.09      ,  20.87      ,   5.02      ,  33.32      ,
       172.17      , 115.51604278,   2.44      , 149.54      ,
       157.37      , 182.47      , 151.44      , 210.29      ,
        59.53      , 226.74      , 184.98      ,  49.89

In [85]:
# RMSE of the fitted pipeline on the hold-out split. The square root is taken
# explicitly because the `squared=False` flag is deprecated and later removed
# in newer scikit-learn releases; this form works on every version.
tree_rmse = np.sqrt(mean_squared_error(y_test, final_predictions))
tree_rmse

11.842376092529536

In [86]:
model.score(X_test,y_test)

0.9755168502887748

# Cross Validation using Pipeline

In [87]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation on the training split. The scorer reports the
# negated RMSE, so the sign is flipped back to get positive errors.
neg_rmse_scores = cross_val_score(
    model,
    X_train,
    y_train,
    cv=10,
    scoring="neg_root_mean_squared_error",
)
forest_rmses = -neg_rmse_scores

In [88]:
pd.Series(forest_rmses).describe()

count    10.000000
mean     10.396067
std       4.623706
min       3.755082
25%       6.696985
50%       9.632928
75%      13.866939
max      18.255821
dtype: float64

In [89]:
# Refit the pipeline on the training split and score it on the hold-out split.
model.fit(X_train, y_train)
final_predictions = model.predict(X_test)
# sqrt(MSE) instead of `squared=False`: that flag is deprecated and later
# removed in newer scikit-learn releases, while this form works everywhere.
forest_rmse = np.sqrt(mean_squared_error(y_test, final_predictions))
forest_rmse

11.842376092529536

# Fine-Tune Your Model
Grid Search

In [90]:
from sklearn.model_selection import GridSearchCV

# Candidate tree depths for the forest step of the pipeline; None means the
# trees are grown without a depth limit.
grid_params = [{"random_forest__max_depth": [1, 2, 3, 4, 5, None]}]

# 5-fold search over the whole pipeline, ranked by (negated) RMSE.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid_params,
    scoring='neg_root_mean_squared_error',
    cv=5,
)

In [91]:
grid_search.fit(X_train,y_train)

In [92]:

grid_search.best_score_

-10.891500327909936

In [93]:
# Score the estimator chosen by the grid search. Predicting with `model` here
# only reproduced the earlier hold-out RMSE, because the search result lives in
# `grid_search`; `best_estimator_` is the pipeline refit with the best
# parameters.
final_predictions_cv = grid_search.best_estimator_.predict(X_test)
# sqrt(MSE) instead of the `squared=False` flag, which newer scikit-learn
# releases deprecate and then remove.
forest_rmse_cv = np.sqrt(mean_squared_error(y_test, final_predictions_cv))
forest_rmse_cv

11.842376092529536

# Exporting the Pipeline

In [94]:
# export 
import pickle

In [95]:
# Serialize the fitted pipeline. The context manager guarantees the file is
# flushed and closed even if pickling raises (the bare open() leaked the handle).
# NOTE(review): this exports `model`, not `grid_search.best_estimator_` —
# confirm that the untuned pipeline is the one meant to be shipped.
with open('wm-model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [96]:
X_test.head()

Unnamed: 0,Year,States,Types of Waste
773,2017,Bremen,Separately collected organic wastes
280,2008,Sachsen,Other wastes
629,2014,Sachsen,Residual household and bulky wastes
318,2009,Hessen,Other wastes
1008,2021,Hamburg,Other wastes


In [97]:
states = data['States'].unique()

In [98]:
states

array(['Baden-Württemberg', 'Bayern', 'Berlin', 'Brandenburg', 'Bremen',
       'Hamburg', 'Hessen', 'Mecklenburg-Vorpommern', 'Niedersachsen',
       'Nordrhein-Westfalen', 'Rheinland-Pfalz', 'Saarland', 'Sachsen',
       'Sachsen-Anhalt', 'Schleswig-Holstein', 'Thüringen'], dtype=object)

In [99]:
df_input = pd.DataFrame(states,columns=['States'])

In [100]:
# Forecast year applied to every state row. 2022 lies outside the years present
# in the training data (2004-2021, see data['Year'].unique()).
# NOTE(review): tree-based regressors generally cannot extrapolate a trend past
# the range they were fitted on — confirm these 2022 predictions are meaningful.
df_input['Year'] = 2022

In [101]:
df_input['Types of Waste'] = 'Residual household and bulky wastes'

In [102]:
df_input

Unnamed: 0,States,Year,Types of Waste
0,Baden-Württemberg,2022,Residual household and bulky wastes
1,Bayern,2022,Residual household and bulky wastes
2,Berlin,2022,Residual household and bulky wastes
3,Brandenburg,2022,Residual household and bulky wastes
4,Bremen,2022,Residual household and bulky wastes
5,Hamburg,2022,Residual household and bulky wastes
6,Hessen,2022,Residual household and bulky wastes
7,Mecklenburg-Vorpommern,2022,Residual household and bulky wastes
8,Niedersachsen,2022,Residual household and bulky wastes
9,Nordrhein-Westfalen,2022,Residual household and bulky wastes


In [103]:
output = model.predict(df_input)

In [104]:
df_predicted = pd.DataFrame(output, columns=['Household Waste Generated per Inhabitant (kg)'])

In [105]:
df_final_bulk = pd.concat([df_input,df_predicted],axis=1)

In [107]:
# Use 1-based row labels for display. Assigning a fresh RangeIndex instead of
# `index + 1` keeps the cell idempotent: re-running it no longer shifts the
# labels again (a displayed frame that starts at 2 is that symptom).
df_final_bulk.index = pd.RangeIndex(1, len(df_final_bulk) + 1)

In [269]:
df_final_bulk

Unnamed: 0,States,Year,Types of Waste,Household Waste Generated per Inhabitant (kg)
2,Baden-Württemberg,2022,Residual household and bulky wastes,143.61
3,Bayern,2022,Residual household and bulky wastes,189.68
4,Berlin,2022,Residual household and bulky wastes,239.47
5,Brandenburg,2022,Residual household and bulky wastes,218.42
6,Bremen,2022,Residual household and bulky wastes,234.4
7,Hamburg,2022,Residual household and bulky wastes,256.67
8,Hessen,2022,Residual household and bulky wastes,176.26
9,Mecklenburg-Vorpommern,2022,Residual household and bulky wastes,231.0
10,Niedersachsen,2022,Residual household and bulky wastes,190.09
11,Nordrhein-Westfalen,2022,Residual household and bulky wastes,189.68


In [270]:
# Work on an independent copy. Plain assignment would only alias `df_input`,
# so overwriting 'Types of Waste' on the new name would silently change
# `df_input` as well and alter the result of re-running earlier cells.
df_input_rec = df_input.copy()

In [271]:
df_input_rec['Types of Waste'] ='Separately collected recyclables'

In [272]:
output_rec = model.predict(df_input_rec)

In [273]:
df_predicted_rec = pd.DataFrame(output_rec, columns=['Household Waste Generated per Inhabitant (kg)'])

In [274]:
df_final_rec = pd.concat([df_input_rec,df_predicted_rec],axis=1)

In [275]:
# Use 1-based row labels for display. A fresh RangeIndex (rather than
# `index + 1`) makes the cell safe to re-run without shifting the labels again.
df_final_rec.index = pd.RangeIndex(1, len(df_final_rec) + 1)

In [276]:
df_final_rec

Unnamed: 0,States,Year,Types of Waste,Household Waste Generated per Inhabitant (kg)
1,Baden-Württemberg,2022,Separately collected recyclables,162.97
2,Bayern,2022,Separately collected recyclables,141.94
3,Berlin,2022,Separately collected recyclables,106.8
4,Brandenburg,2022,Separately collected recyclables,143.67
5,Bremen,2022,Separately collected recyclables,122.1
6,Hamburg,2022,Separately collected recyclables,119.58
7,Hessen,2022,Separately collected recyclables,138.48
8,Mecklenburg-Vorpommern,2022,Separately collected recyclables,141.19
9,Niedersachsen,2022,Separately collected recyclables,166.34
10,Nordrhein-Westfalen,2022,Separately collected recyclables,141.94


In [None]:
# Show the predicted row for a single state.
# NOTE(review): the previous draft had an unterminated string, used an
# undefined `df` and a 'city' column that does not exist in these frames;
# filtering the latest prediction frame on 'States' appears to be the intent.
df_final_rec[df_final_rec['States'] == 'Hamburg']