# Capacity_mw

In [110]:
import pandas as pd

In [111]:
from sklearn.model_selection import train_test_split

In [112]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [113]:
from sklearn.impute import SimpleImputer

In [114]:
from sklearn.compose import ColumnTransformer

In [115]:
from sklearn.pipeline import Pipeline

In [116]:
from sklearn.ensemble import RandomForestRegressor

In [117]:
from sklearn.metrics import mean_absolute_error

In [118]:
data_aus = pd.read_csv('database_AUS.csv')
data_ind = pd.read_csv('database_IND.csv')
data_usa = pd.read_csv('database_USA.csv')

In [119]:
data = pd.concat([data_aus, data_ind, data_usa], ignore_index=True)

In [120]:
print('Columns in the combined dataset:')
print(data.columns)

Columns in the combined dataset:
Index(['country', 'country_long', 'name', 'gppd_idnr', 'capacity_mw',
       'latitude', 'longitude', 'primary_fuel', 'other_fuel1', 'other_fuel2',
       'other_fuel3', 'commissioning_year', 'owner', 'source', 'url',
       'geolocation_source', 'wepp_id', 'year_of_capacity_data',
       'generation_gwh_2013', 'generation_gwh_2014', 'generation_gwh_2015',
       'generation_gwh_2016', 'generation_gwh_2017', 'generation_gwh_2018',
       'generation_gwh_2019', 'generation_data_source',
       'estimated_generation_gwh'],
      dtype='object')


In [121]:
columns_to_drop = ['gppd_idnr', 'wepp_id', 'url', 'source', 'geolocation_source', 'generation_data_source', 'estimated_generation_note_2013', 'estimated_generation_note_2014', 'estimated_generation_note_2015', 'estimated_generation_note_2016', 'estimated_generation_note_2017']

In [122]:
columns_to_drop = [col for col in columns_to_drop if col in data.columns]
data.drop(columns=columns_to_drop, inplace=True)

In [123]:
for col in data.columns:
    if data[col].dtype == object:
        data[col] = data[col].astype(str)
else:
    data[col] = pd.to_numeric(data[col], errors='coerce')

In [124]:
data = data.dropna(subset=['capacity_mw'])

In [125]:
data = data.dropna(axis=1, how='all')

In [126]:
x = data.drop(columns='capacity_mw')
y = data['capacity_mw']

In [127]:
numerical_features = x.select_dtypes(include=['number']).columns.tolist()
categorical_features = x.select_dtypes(include=['object']).columns.tolist()

In [128]:
numerical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')), ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [129]:
preprocessor = ColumnTransformer(transformers=[('num', numerical_transformer, numerical_features), ('cat', categorical_transformer, categorical_features)])

In [130]:
model = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))])

In [131]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [132]:
model.fit(x_train, y_train)

In [134]:
y_pred = model.predict(x_test)

In [135]:
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

Mean Absolute Error: 39.52589868747649
