In [1]:
import numpy as np
import pandas as pd

# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree  import DecisionTreeRegressor

In [3]:
# Print the installed scikit-learn version so the environment is recorded next to the results.
import sklearn
print(sklearn.__version__)

1.3.0


In [4]:
# Load the processed waste dataset from the CSV file (path is relative to the notebook).
waste_csv_path = '../data/Processed_DatasetsAmount-of Waste-Generated-By-State 32121-0003.csv'
data = pd.read_csv(waste_csv_path)

In [5]:
# Preview the first rows of the raw dataset.
data.head()

Unnamed: 0,Year,States,Types of Waste,Total Household Waste Generated (Tons),Household Waste Generated per Inhabitant (kg)
0,2004,Baden-Württemberg,Residual household and bulky wastes,1605.6,150.0
1,2004,Baden-Württemberg,Separately collected organic wastes,1220.5,114.0
2,2004,Baden-Württemberg,Separately collected recyclables,1645.3,154.0
3,2004,Baden-Württemberg,Other wastes,9.4,1.0
4,2004,Bayern,Separately collected organic wastes,1677.3,135.0


In [6]:
# Summarise the columns: names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1046 entries, 0 to 1045
Data columns (total 5 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Year                                           1046 non-null   int64  
 1   States                                         1046 non-null   object 
 2   Types of Waste                                 1046 non-null   object 
 3   Total Household Waste Generated (Tons)         1046 non-null   float64
 4   Household Waste Generated per Inhabitant (kg)  1046 non-null   float64
dtypes: float64(2), int64(1), object(2)
memory usage: 41.0+ KB


In [7]:
# Count missing values per column.
data.isnull().sum()

Year                                             0
States                                           0
Types of Waste                                   0
Total Household Waste Generated (Tons)           0
Household Waste Generated per Inhabitant (kg)    0
dtype: int64

In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data.describe()

Unnamed: 0,Year,Total Household Waste Generated (Tons),Household Waste Generated per Inhabitant (kg)
count,1046.0,1046.0,1046.0
mean,2012.451243,405.893881,110.816904
std,5.18103,462.755575,80.167121
min,2004.0,0.1,1.0
25%,2008.0,29.4,21.0
50%,2012.0,230.3,125.0
75%,2017.0,613.75,163.0
max,2021.0,1799.1,349.0


In [9]:
# Pairwise correlation between the numeric columns only. `numeric_only=True` is passed
# explicitly because the frame also holds string columns ('States', 'Types of Waste'),
# and pandas 2.x raises on them when the flag is left at its default.
data.corr(numeric_only=True)

Unnamed: 0,Year,Total Household Waste Generated (Tons),Household Waste Generated per Inhabitant (kg)
Year,1.0,-0.020062,0.019148
Total Household Waste Generated (Tons),-0.020062,1.0,0.575053
Household Waste Generated per Inhabitant (kg),0.019148,0.575053,1.0


In [10]:
# Distinct years present in the dataset.
data['Year'].unique()

array([2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014,
       2015, 2016, 2017, 2018, 2019, 2020, 2021])

In [11]:
# Distinct state names present in the dataset.
data['States'].unique()

array(['Baden-Württemberg', 'Bayern', 'Berlin', 'Brandenburg', 'Bremen',
       'Hamburg', 'Hessen', 'Mecklenburg-Vorpommern', 'Niedersachsen',
       'Nordrhein-Westfalen', 'Rheinland-Pfalz', 'Saarland', 'Sachsen',
       'Sachsen-Anhalt', 'Schleswig-Holstein', 'Thüringen'], dtype=object)

In [12]:
# Distinct waste categories present in the dataset.
data['Types of Waste'].unique()

array(['Residual household and bulky wastes',
       'Separately collected organic wastes',
       'Separately collected recyclables', 'Other wastes'], dtype=object)

In [13]:
# Distinct values of the total-tonnage column.
# NOTE(review): this is a continuous measure, so the printed array is long; a summary
# such as .describe() or .nunique() may be easier to read.
data['Total Household Waste Generated (Tons)'].unique()

array([1.6056e+03, 1.2205e+03, 1.6453e+03, 9.4000e+00, 1.6773e+03,
       5.9100e+01, 9.7500e+02, 9.9200e+01, 3.9020e+02, 1.5000e+00,
       5.5820e+02, 7.2100e+01, 3.4540e+02, 6.6000e+00, 1.6700e+02,
       5.7000e+01, 9.5000e+01, 2.0000e-01, 3.7300e+01, 1.7020e+02,
       1.9000e+00, 1.2787e+03, 7.2780e+02, 8.1010e+02, 3.3000e+00,
       4.0710e+02, 6.9700e+01, 2.5360e+02, 1.4000e+00, 1.6581e+03,
       1.1052e+03, 1.1605e+03, 1.0100e+01, 9.8100e+01, 8.0410e+02,
       5.3250e+02, 6.4320e+02, 2.1500e+01, 2.6880e+02, 1.6030e+02,
       1.2640e+02, 5.0000e-01, 6.9600e+02, 2.0330e+02, 5.8870e+02,
       2.6000e+00, 6.2530e+02, 1.8330e+02, 3.3840e+02, 2.0800e+01,
       6.8850e+02, 2.8050e+02, 4.1370e+02, 4.9750e+02, 1.4380e+02,
       3.3630e+02, 1.4700e+01, 1.5683e+03, 1.2393e+03, 1.7027e+03,
       8.8000e+00, 1.5971e+03, 5.4100e+01, 9.7890e+02, 1.1380e+02,
       4.0670e+02, 5.6250e+02, 7.2500e+01, 3.5270e+02, 4.1000e+00,
       1.6410e+02, 5.8000e+01, 8.8100e+01, 3.0000e-01, 3.6200e

In [14]:
# Distinct values of the per-inhabitant waste column (the quantity later used as the target).
# NOTE(review): continuous measure, so the printed array is long; consider a summary instead.
data['Household Waste Generated per Inhabitant (kg)'].unique()

array([150.        , 114.        , 154.        ,   1.        ,
       135.        ,   5.        , 288.        ,  29.        ,
       115.        , 115.51604278, 217.        ,  28.        ,
         3.        , 252.        ,  86.        , 143.        ,
        21.        ,  98.        , 210.        , 119.        ,
       133.        , 237.        ,  41.        , 147.        ,
       207.        , 138.        , 145.        , 198.        ,
       131.        , 158.        , 254.        , 152.        ,
       120.        , 162.        ,  47.        , 137.        ,
       251.        ,  73.        , 136.        ,   8.        ,
       243.        ,  99.        , 146.        , 211.        ,
        61.        ,   6.        , 159.        , 128.        ,
         4.        ,  34.        , 220.        ,   2.        ,
       247.        ,  88.        ,  94.        , 208.        ,
       116.        , 235.        ,  42.        , 144.        ,
       199.        , 140.        , 129.        , 161.  

In [15]:
# Keep only the rows for the 'Residual household and bulky wastes' category.
is_residual_bulky = data['Types of Waste'] == 'Residual household and bulky wastes'
data_bulk_waste = data[is_residual_bulky]

In [16]:
# Preview the filtered rows (residual household and bulky wastes only).
data_bulk_waste.head()

Unnamed: 0,Year,States,Types of Waste,Total Household Waste Generated (Tons),Household Waste Generated per Inhabitant (kg)
0,2004,Baden-Württemberg,Residual household and bulky wastes,1605.6,150.0
6,2004,Berlin,Residual household and bulky wastes,975.0,288.0
10,2004,Brandenburg,Residual household and bulky wastes,558.2,217.0
14,2004,Bremen,Residual household and bulky wastes,167.0,252.0
21,2004,Hessen,Residual household and bulky wastes,1278.7,210.0


In [17]:
# Shorten the per-inhabitant column name to 'Waste'; it is used as the target further down.
waste_column_rename = {"Household Waste Generated per Inhabitant (kg)": "Waste"}
data_bulk_waste = data_bulk_waste.rename(columns=waste_column_rename)

In [18]:
# Confirm the rename: the last column should now be called 'Waste'.
data_bulk_waste.head()

Unnamed: 0,Year,States,Types of Waste,Total Household Waste Generated (Tons),Waste
0,2004,Baden-Württemberg,Residual household and bulky wastes,1605.6,150.0
6,2004,Berlin,Residual household and bulky wastes,975.0,288.0
10,2004,Brandenburg,Residual household and bulky wastes,558.2,217.0
14,2004,Bremen,Residual household and bulky wastes,167.0,252.0
21,2004,Hessen,Residual household and bulky wastes,1278.7,210.0


In [19]:
# Remove the columns that are not used for modelling: the waste type is constant after
# the filter above, and the total tonnage is not used as a feature.
unused_columns = ['Types of Waste', 'Total Household Waste Generated (Tons)']
data_bulk_waste = data_bulk_waste.drop(columns=unused_columns)

In [32]:
# Preview the remaining columns: Year, States and the Waste target.
data_bulk_waste.head()

Unnamed: 0,Year,States,Waste
0,2004,Baden-Württemberg,150.0
6,2004,Berlin,288.0
10,2004,Brandenburg,217.0
14,2004,Bremen,252.0
21,2004,Hessen,210.0


In [33]:
# Target variable: the per-inhabitant waste column that was renamed to 'Waste' above.
y = data_bulk_waste['Waste']

In [34]:
# Preview the first target values.
y.head()

0     150.0
6     288.0
10    217.0
14    252.0
21    210.0
Name: Waste, dtype: float64

In [35]:
# Remove the target from the frame so that only the features (Year, States) remain.
data_bulk_waste = data_bulk_waste.drop(columns=['Waste'])

In [36]:
# Display the feature frame (Year, States) that is passed to train_test_split below.
data_bulk_waste

Unnamed: 0,Year,States
0,2004,Baden-Württemberg
6,2004,Berlin
10,2004,Brandenburg
14,2004,Bremen
21,2004,Hessen
...,...,...
1026,2021,Saarland
1030,2021,Sachsen
1034,2021,Sachsen-Anhalt
1038,2021,Schleswig-Holstein


In [38]:
# Step 1: hold out 20% of the rows as a test set; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data_bulk_waste,
    y,
    test_size=0.2,
    random_state=42,
)

In [39]:
# Preview the training features.
X_train.head()

Unnamed: 0,Year,States
563,2013,Rheinland-Pfalz
943,2020,Bremen
997,2021,Brandenburg
508,2012,Saarland
579,2013,Schleswig-Holstein


In [40]:
# Show five randomly chosen training targets. The seed is fixed so the same rows are
# displayed on every run (an unseeded sample changes each time the cell is executed).
y_train.sample(5, random_state=42)

425    245.0
608    189.0
273    240.0
116    148.0
14     252.0
Name: Waste, dtype: float64

In [51]:
# Column groups consumed by the preprocessing ColumnTransformer.
numeric_features, categorical_features = ['Year'], ['States']

In [52]:
# Rescale the numeric feature(s) into the [-1, 1] range.
year_scaler = MinMaxScaler(feature_range=(-1, 1))
numeric_transformer = Pipeline(steps=[('scaler', year_scaler)])

In [53]:
# One-hot encode the categorical feature(s). Categories that were not seen during fit
# are ignored at transform time instead of raising an error.
state_encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
categorical_transformer = Pipeline(steps=[('onehot', state_encoder)])

In [54]:
# Route each column group to its own transformer. No `remainder` is given, so columns
# that are not listed here fall under ColumnTransformer's default handling.
column_transformers = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [55]:
# Instantiate the regressor with a fixed seed. Nothing is trained here; fitting happens
# when the full pipeline's .fit() is called further down.
random_forest = RandomForestRegressor(random_state=42)
# Alternative estimator, currently disabled: DecisionTreeRegressor(random_state=42)

In [56]:
# Ask scikit-learn to render estimators as diagrams when they are displayed in the notebook.
from sklearn import set_config
set_config(display='diagram')

In [57]:
# Assemble the end-to-end pipeline: preprocessing followed by the random forest.
# NOTE(review): the first step name is spelled 'precprocessor' (sic). It is a runtime key
# (it appears in model.named_steps), so it is left unchanged here; rename it deliberately
# if nothing else depends on the current spelling.
model = Pipeline(steps=[
    ('precprocessor', preprocessor),
    # Optional feature-selection step, currently disabled: ('feature_selection', feature_selection),
    ('random_forest', random_forest)
])

In [58]:
# Fit the preprocessing steps and the random forest on the training split.
model.fit(X_train,y_train)

In [59]:
# Inspect the pipeline's steps by name.
model.named_steps

{'precprocessor': ColumnTransformer(transformers=[('num',
                                  Pipeline(steps=[('scaler',
                                                   MinMaxScaler(feature_range=(-1,
                                                                               1)))]),
                                  ['Year']),
                                 ('cat',
                                  Pipeline(steps=[('onehot',
                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                  ['States'])]),
 'random_forest': RandomForestRegressor(random_state=42)}

In [60]:
# Preview the held-out test features.
X_test.head()

Unnamed: 0,Year,States
604,2014,Hamburg
29,2004,Niedersachsen
417,2011,Berlin
260,2008,Mecklenburg-Vorpommern
479,2012,Brandenburg


In [61]:
# Predict the target for the held-out test rows with the fitted pipeline.
final_predictions = model.predict(X_test)

In [62]:
# Display the predicted values for the test rows.
final_predictions

array([287.22, 208.2 , 256.23, 231.98, 211.63, 242.54, 226.44, 265.14,
       158.51, 192.49, 226.61, 178.81, 139.54, 260.62, 189.75, 213.29,
       139.54, 219.45, 200.37, 236.63, 200.45, 207.16, 297.7 , 144.42,
       148.93, 209.08, 271.68, 226.89, 344.39, 151.65, 188.85, 205.48,
       229.2 , 195.05, 207.75, 252.32, 234.54, 212.22, 139.97, 188.46,
       186.67, 144.69, 228.5 , 178.31, 139.54, 251.18, 184.94, 263.56,
       188.5 , 231.26])

In [63]:
# Root-mean-squared error of the random-forest pipeline on the held-out test set.
# The MSE is computed first and its square root taken explicitly: the `squared=False`
# shortcut is deprecated in newer scikit-learn releases, while this form works on all of them.
tree_rmse = np.sqrt(mean_squared_error(y_test, final_predictions))
tree_rmse

5.169061616966857

In [64]:
# Score the pipeline on the test split (for a regressor, .score() reports R^2).
model.score(X_test,y_test)

0.9866099681355786

In [72]:
# Preview the test features again, as a reminder of the input layout (Year, States)
# before building the prediction input below.
X_test.head()

Unnamed: 0,Year,States
604,2014,Hamburg
29,2004,Niedersachsen
417,2011,Berlin
260,2008,Mecklenburg-Vorpommern
479,2012,Brandenburg


In [74]:
# Collect the distinct state names from the full dataset; one prediction row is built per state.
states = data['States'].unique()

In [78]:
# Display the state names that will be used as prediction inputs.
states

array(['Baden-Württemberg', 'Bayern', 'Berlin', 'Brandenburg', 'Bremen',
       'Hamburg', 'Hessen', 'Mecklenburg-Vorpommern', 'Niedersachsen',
       'Nordrhein-Westfalen', 'Rheinland-Pfalz', 'Saarland', 'Sachsen',
       'Sachsen-Anhalt', 'Schleswig-Holstein', 'Thüringen'], dtype=object)

In [83]:
# Start the prediction input frame with one row per state.
df_input = pd.DataFrame({'States': states})

In [84]:
# Year to predict for; the same value is applied to every state row.
forecast_year = 2022
df_input['Year'] = forecast_year

In [99]:
# Label every row with the waste category the model was trained on.
# The pipeline's ColumnTransformer is configured only for 'Year' and 'States', so this
# column is informational (it is carried into the final table) rather than a model feature.
df_input['Types of Waste'] = 'Residual household and bulky wastes'

In [100]:
# Display the assembled prediction input (one row per state for the chosen year).
df_input

Unnamed: 0,States,Year,Types of Waste
0,Baden-Württemberg,2022,Residual household and bulky wastes
1,Bayern,2022,Residual household and bulky wastes
2,Berlin,2022,Residual household and bulky wastes
3,Brandenburg,2022,Residual household and bulky wastes
4,Bremen,2022,Residual household and bulky wastes
5,Hamburg,2022,Residual household and bulky wastes
6,Hessen,2022,Residual household and bulky wastes
7,Mecklenburg-Vorpommern,2022,Residual household and bulky wastes
8,Niedersachsen,2022,Residual household and bulky wastes
9,Nordrhein-Westfalen,2022,Residual household and bulky wastes


In [101]:
# Predict per-inhabitant waste for every state in the prediction input.
# NOTE(review): the input year (2022) lies outside the 2004-2021 range shown by
# data['Year'].unique(). Tree-based models cannot extrapolate a trend beyond the range
# seen in training, so these values presumably mirror the most recent training years;
# confirm before presenting them as a forecast.
output = model.predict(df_input)

In [102]:
# Wrap the predictions in a single-column frame named after the original target column.
df_predicted = pd.DataFrame(
    {'Household Waste Generated per Inhabitant (kg)': output}
)

In [103]:
# Place the predictions next to their inputs; rows are aligned on the shared index.
frames_side_by_side = [df_input, df_predicted]
df_final = pd.concat(frames_side_by_side, axis='columns')

In [104]:
# Number the rows from 1 instead of 0 for presentation. An absolute 1..n range is assigned
# (rather than adding 1 to the current index) so that re-running the cell does not shift
# the numbering again.
df_final.index = range(1, len(df_final) + 1)

In [105]:
# Display the final table: state, year, waste type and predicted per-inhabitant waste (kg).
df_final

Unnamed: 0,States,Year,Types of Waste,Household Waste Generated per Inhabitant (kg)
1,Baden-Württemberg,2022,Residual household and bulky wastes,139.54
2,Bayern,2022,Residual household and bulky wastes,197.43
3,Berlin,2022,Residual household and bulky wastes,239.57
4,Brandenburg,2022,Residual household and bulky wastes,216.11
5,Bremen,2022,Residual household and bulky wastes,234.26
6,Hamburg,2022,Residual household and bulky wastes,260.62
7,Hessen,2022,Residual household and bulky wastes,180.77
8,Mecklenburg-Vorpommern,2022,Residual household and bulky wastes,231.47
9,Niedersachsen,2022,Residual household and bulky wastes,194.92
10,Nordrhein-Westfalen,2022,Residual household and bulky wastes,197.43
