In [None]:
# Dependency check: try every third-party import the notebook uses and print
# the outcome instead of letting the cell fail with a traceback.
try:
    import pandas as pd
    import numpy as np
    import joblib
    from sklearn.model_selection import train_test_split
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.multioutput import MultiOutputRegressor
    from sklearn.linear_model import LinearRegression
    print('✅ All required packages are already installed')
except ImportError as e:
    # Only ImportError is handled; str(e) is the message of the first import
    # that failed, so later missing packages are not reported in the same run.
    print(f'❌ Missing package: {str(e)}')
    print('Please run: pip install pandas numpy scikit-learn joblib')

: 

# ESG & SDG Prediction Notebook
This notebook downloads World Bank project data, cleans it, derives keyword-based ESG (Environmental, Social, Governance) scores from each project description, and trains ESG regression and SDG models.

In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression

print('✅ Libraries loaded')

## Download World Bank Data

In [None]:
# World Bank "Projects and Operations" CSV export; read over HTTPS each time
# this cell runs.
url = 'https://datacatalogfiles.worldbank.org/ddh-published/0038022/DR0038022/World_Bank_Projects_and_Operations.csv'

# Column name in the raw file -> column name used by the rest of the notebook.
project_columns = {
    'id': 'ProjectID',
    'countryname': 'Country',
    'project_abstract.cdata': 'Description',
}
raw_columns = list(project_columns)

# Parse only the three columns that are kept, then build the cleaned frame as a
# chain of non-mutating steps. This replaces dropna(inplace=True) on a frame
# derived from a column selection, which is the pattern pandas warns about.
df = (
    pd.read_csv(url, usecols=raw_columns)
    .loc[:, raw_columns]               # usecols keeps file order; restore ours
    .rename(columns=project_columns)
    .dropna()                          # drop rows missing any of the three fields
)
df.to_csv('projects.csv', index=False)
df.head()

## Generate ESG Scores

In [None]:
# Keyword lists for each ESG pillar, defined once rather than rebuilt per call.
ESG_KEYWORDS = {
    'E': ('renewable', 'solar', 'wind', 'climate', 'water', 'carbon',
          'forest', 'pollution', 'sustainability'),
    'S': ('education', 'health', 'community', 'women', 'youth', 'poverty',
          'training', 'employment', 'social', 'housing'),
    'G': ('governance', 'transparency', 'policy', 'regulation',
          'anti-corruption', 'institution', 'audit', 'compliance', 'reform'),
}


def score_esg(text, saturation=5):
    """Score a text on the Environment, Social and Governance pillars.

    For each pillar, counts how many of its keywords occur in the lower-cased
    text, divides by ``saturation``, caps the result at 1.0 and rounds to two
    decimals. Each keyword counts at most once, however often it appears.

    Matching is by substring, not by whole word: 'institution' also matches
    'institutional', and 'wind' also matches 'window'.

    Args:
        text: Value to score. It is passed through ``str()``, so non-string
            input (for example ``None``) is scored on its string form.
        saturation: Number of distinct keyword hits that yields the maximum
            score of 1.0. Defaults to 5.

    Returns:
        A tuple ``(E, S, G)`` of floats, each between 0.0 and 1.0.

    Raises:
        ValueError: If ``saturation`` is not positive.
    """
    if saturation <= 0:
        raise ValueError('saturation must be a positive number')

    haystack = str(text).lower()

    def pillar_score(keywords):
        hits = sum(keyword in haystack for keyword in keywords)
        # 1.0 (not 1) keeps the return type float even when the cap applies.
        return round(min(1.0, hits / saturation), 2)

    return tuple(pillar_score(ESG_KEYWORDS[pillar]) for pillar in ('E', 'S', 'G'))

# Score each description once and assemble the three columns in one step.
# Building the frame from a list of tuples avoids creating a pandas Series for
# every row, and still yields a well-formed (empty) frame when df has no rows.
esg_scores = pd.DataFrame(
    [score_esg(description) for description in df['Description']],
    index=df.index,            # keep row alignment with df's existing index
    columns=['E', 'S', 'G'],
    dtype=float,
)
df[['E', 'S', 'G']] = esg_scores
df.to_csv('projects_scored.csv', index=False)
df.head()

## Train Regression Model

In [None]:
# Features are the raw description strings; targets are the three pillar scores.
X = df['Description']
y = df[['E','S','G']]

# Vectorise the descriptions with TF-IDF, limited to 4000 features.
vectorizer = TfidfVectorizer(max_features=4000)
X_vec = vectorizer.fit_transform(X)

# Wrap LinearRegression so all three targets are fitted together; every row of
# df is used for fitting (no held-out split is made in this cell).
model = MultiOutputRegressor(LinearRegression())
model.fit(X_vec, y)

# Save the fitted model and the fitted vectorizer as separate files.
joblib.dump(model, 'esg_regression.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')
print('✅ ESG Regression Model Trained')

## Prediction Function

In [None]:
def predict_esg(text, vectorizer=None, model=None):
    """Predict Environment, Social and Governance scores for one text.

    Args:
        text: Project description to score.
        vectorizer: Optional fitted vectorizer exposing ``transform``. When
            omitted, it is loaded from 'vectorizer.pkl'.
        model: Optional fitted regressor exposing ``predict``. When omitted,
            it is loaded from 'esg_regression.pkl'.

    Returns:
        A dict with keys 'Environment', 'Social' and 'Governance', each a
        plain Python float rounded to two decimals. The values are the
        model's raw outputs and are not clipped to any range.

    Passing already-loaded ``vectorizer`` and ``model`` objects avoids reading
    both files from disk on every call, which matters when scoring many texts.
    """
    # joblib.load unpickles arbitrary objects, so these paths must only ever
    # point at files written by a trusted source (here: the training cell).
    if vectorizer is None:
        vectorizer = joblib.load('vectorizer.pkl')
    if model is None:
        model = joblib.load('esg_regression.pkl')

    prediction = model.predict(vectorizer.transform([text]))[0]
    labels = ('Environment', 'Social', 'Governance')
    # float() turns NumPy scalars into built-in floats so the dict displays
    # and serialises cleanly.
    return {label: round(float(value), 2) for label, value in zip(labels, prediction)}

# Example: score a sample project description using the saved vectorizer and model.
predict_esg('This project installs solar microgrids for rural villages and trains women workers.')