In [1]:
import pandas as pd 
import sqlalchemy
from sqlalchemy import create_engine
import pickle
import numpy as np
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [2]:
DB_CONFIG = {
        'host': 'localhost',
        'user': 'root',
        'password': 'root',
        'database': 'aiddata_personal',
        'port': 3306
    }

# Create the connection string
connection_string = f"mysql+pymysql://{DB_CONFIG['user']}:{DB_CONFIG['password']}@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']}"

# Create the SQLAlchemy engine
engine = create_engine(connection_string)

In [3]:
df = pd.read_sql_query("""
SELECT p.aid_data_record_id, p.aid_data_parent_id, p.title, p.completion_year, c.name, p.description
FROM sectors as s 
JOIN projects as p ON p.sector_id = s.id
JOIN countries as c ON p.recipient_country_id = c.id
JOIN flow_types as f ON p.flow_type_id = f.id
JOIN status_types as st ON p.status_id = st.id
WHERE s.name = 'ENERGY'
AND p.recommended_for_aggregates = 1
AND f.simplified != 'Debt Rescheduling'
AND st.status != 'Cancelled'
AND p.completion_year IS NOT NULL
AND p.description LIKE %(pattern)s
""", con=engine, params={'pattern': '%MW%'})

In [4]:
int = ['aid_data_record_id','aid_data_parent_id']
df[int] = df[int].astype('Int64')

In [5]:
#make a new column for mw, slice out the number before MW
df['mw'] = None
df['mw'] = df['description'].str.extract(r'(\d+\.?\d*)\s*MW', expand=False)
df['mw'] = pd.to_numeric(df['mw'], errors='coerce')
count_title_mw = df['mw'].notnull().sum()
print(f"Number of records under the column 'description' that have MW: {count_title_mw}")

Number of records under the column 'description' that have MW: 368


In [6]:
# how many recrods under the column tile have mw
# Extract MW from the 'title' column and count non-null values
df['title_mw'] = df['title'].str.extract(r'(\d+\.?\d*)\s*MW', expand=False)
df['title_mw'] = pd.to_numeric(df['title_mw'], errors='coerce')
count_title_mw = df['title_mw'].notnull().sum()
print(f"Number of records under the column 'title' that have MW: {count_title_mw}")


Number of records under the column 'title' that have MW: 283


In [7]:
# show me the row that has null for title_mw
df_null_title_mw = df[df['title_mw'].isnull() & df['mw'].notnull()]
print(df_null_title_mw[['aid_data_record_id', 'title', 'description']])

     aid_data_record_id                                              title  \
12                 1049  CMEC provides $551.5 million supplier's credit...   
15                 2251  CNEEC provides $3 million supplier credit for ...   
22                30244  CET provides $1.002 billion supplier's credit ...   
26                31023  China Eximbank provides RMB 592.4 million gove...   
33                33244  China Eximbank provides $300 million preferent...   
..                  ...                                                ...   
356               96053  CDB and China Eximbank provide loan for 552 (4...   
357               96055  Bank of China contributes to $765 million synd...   
359               96465  Bank of China contributes to $81 million syndi...   
360               97774  CDB provides loan for Malabo Turbogas Plant Ex...   
368               98835  ICBC contributes $42.77 million to syndicated ...   

                                           description  
12   O

In [8]:
# prefer the title_mw over mw, unless title_mw is null
df.loc[df['title_mw'].notnull(), 'mw'] = df['title_mw']


In [9]:
df.drop(columns=['title_mw'],inplace=True)

In [10]:
with open("C:/Users/wikku/aiddata/personal_research/emission_models.pkl", "rb") as f:
    loaded_models = pickle.load(f)

In [11]:
loaded_models

{'co2': RandomForestRegressor(random_state=42),
 'so2': RandomForestRegressor(random_state=42),
 'nox': RandomForestRegressor(random_state=42)}

In [18]:
percentage_running= 0.8 #depends on percentage of time the plant is running, this will be multiplied by the number of hours in a year, 8760
tajikistan_df = df[df['name'] == 'Tajikistan']
tajikistan_df['mw'].iloc[0] = 150
tajikistan_df['mw'].iloc[1] = 50
tajikistan_df['mw'].iloc[2] = 50
tajikistan_df['mw'].iloc[3] = 150
tajikistan_df.rename(columns={'completion_year': 'year'}, inplace=True)
tajikistan_df['generation_mwh'] = tajikistan_df['mw'] * (8760  * percentage_running) #convert MW to MWh, assuming running every hour of the year
tajikistan_df['fuel_group'] = 'coal'
tajikistan_df['fuel_group_gas'] = 0
tajikistan_df['fuel_group_oil'] = 0

X_new = tajikistan_df[['generation_mwh', 'year', 'fuel_group_gas', 'fuel_group_oil']]
tajikistan_df['Predicted_co2'] = loaded_models['co2'].predict(X_new)
tajikistan_df['Predicted_so2'] = loaded_models['so2'].predict(X_new)
tajikistan_df['Predicted_nox'] = loaded_models['nox'].predict(X_new)
tajikistan_df.drop(columns=['fuel_group_gas', 'fuel_group_oil', 'fuel_group'], inplace=True)  # Clean up the DataFrame
tajikistan_df['pred_co2_emissions'] = np.exp(tajikistan_df['Predicted_co2'])
tajikistan_df['pred_so2_emissions'] = np.exp(tajikistan_df['Predicted_so2'])
tajikistan_df['pred_nox_emissions'] = np.exp(tajikistan_df['Predicted_nox'])


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  tajikistan_df['mw'].iloc[0] = 150
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tajikistan_df['mw'].iloc[0] =

In [19]:
tajikistan_df

Unnamed: 0,aid_data_record_id,aid_data_parent_id,title,year,name,description,mw,generation_mwh,Predicted_co2,Predicted_so2,Predicted_nox,pred_co2_emissions,pred_so2_emissions,pred_nox_emissions
99,46211,1868,China Eximbank provides RMB 929.9 million gove...,2016,Tajikistan,"On July 4, 2009, the Export-Import Bank of Chi...",150.0,1051200.0,14.366,8.544,7.434,1734802.648,5136.348,1693.264
133,54106,1868,China Eximbank provides $15 million loan for P...,2014,Tajikistan,"On July 4, 2009, the Export-Import Bank of Chi...",50.0,350400.0,14.164,7.382,8.197,1416241.523,1607.22,3629.921
274,71101,1868,China Development Bank provides $15 million lo...,2014,Tajikistan,"On July 4, 2009, the Export-Import Bank of Chi...",50.0,350400.0,14.164,7.382,8.197,1416241.523,1607.22,3629.921
275,71137,1868,China Eximbank provides $178.9 million prefere...,2016,Tajikistan,"On July 4, 2009, the Export-Import Bank of Chi...",150.0,1051200.0,14.366,8.544,7.434,1734802.648,5136.348,1693.264


In [20]:
tajikistan_df['pred_co2_emissions'].sum()

6302088.343475139