### This objective aims to predict the number of employees of a company that hires an alumni, based on professional attributes such as industry, number of connections, and specialties.

In [5]:
#import librairies
import pandas as pd
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor 
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# PostgreSQL Connection
engine = create_engine("postgresql+psycopg2://postgres:postgres123@localhost:5432/DW")
query = """
SELECT a."Industry",
       f."num_of_connections",
       c."company_size", c."company_specialties"
FROM "Fact_Employability" f
JOIN "Dim_Alumini" a ON f."AluminiFK" = a."AluminiKey"
JOIN "Dim_Company" c ON f."CompanyFK" = c."CompanyKey"
"""
conn = engine.raw_connection()
df = pd.read_sql(query, conn)
conn.close()

df.head()

  df = pd.read_sql(query, conn)


Unnamed: 0,Industry,num_of_connections,company_size,company_specialties
0,"technology, information and internet",5685,164,"ESG,Real Estate Software,Software,Consulting,S..."
1,software development,1875,103,"Energy,Trading,Commodities,Environmental"
2,electrical equipment manufacturing,2117,34375,"Aircraft,Aerostructures,Helicopters,Electronic..."
3,space research and technology,1931,144150,"freelancers,agencies"
4,"transportation, logistics, supply chain and st...",1659,69028,"Container Shipping,Container Terminals,Retail,..."


We compared Random Forest, Gradient Boosting, and CatBoost because they are suitable for predicting continuous outcomes from structured data. 
Random Forest is robust and easy to use but showed lower accuracy.
Gradient Boosting offers high precision and handles complex patterns well, though it’s slower to train. 
CatBoost is efficient with categorical features and handles missing data, but it’s slightly more complex to tune. 

In [6]:
# Data Cleaning
df['company_size'] = pd.to_numeric(df['company_size'], errors='coerce')
df['num_of_connections'] = pd.to_numeric(df['num_of_connections'], errors='coerce')
df = df.dropna()
df = df[df['company_size'] < df['company_size'].quantile(0.98)]  # remove outliers

# Features and Target
X = df[['Industry', 'company_specialties', 'num_of_connections']]
y = df['company_size']

preprocessor = ColumnTransformer([
    ("cat", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), ['Industry', 'company_specialties']),
], remainder='passthrough')

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Regression Models
models = {
    "RandomForest": RandomForestRegressor(random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42)  # ✅ Added here
}

# Training and Evaluation
results = []
for name, model in models.items():
    pipe = Pipeline([
        ("prep", preprocessor),
        ("reg", model)
    ])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    r2 = r2_score(y_test, y_pred)

    print(f"\n📊 {name}")
    print(f"✅ MAE : {mae:.2f} | RMSE : {rmse:.2f} | R² : {r2:.4f}")
    results.append((name, mae, rmse, r2))

# Summary
df_results = pd.DataFrame(results, columns=["Model", "MAE", "RMSE", "R2"]).sort_values(by="R2", ascending=False)
print("\n🎯 Performance Summary:\n", df_results)



📊 RandomForest
✅ MAE : 7287.48 | RMSE : 13869.43 | R² : 0.5571

📊 GradientBoosting
✅ MAE : 5349.09 | RMSE : 11967.00 | R² : 0.6703

📊 CatBoost
✅ MAE : 5842.63 | RMSE : 12040.51 | R² : 0.6662

🎯 Performance Summary:
               Model          MAE          RMSE        R2
1  GradientBoosting  5349.091848  11966.999373  0.670269
2          CatBoost  5842.626956  12040.511504  0.666206
0      RandomForest  7287.479828  13869.429489  0.557100


=> The best performing algorithm is GradientBoosting, with the lowest MAE (5349.09), RMSE (11967.00), and the highest R² (0.6703). This indicates it has the best overall accuracy in predicting the target variable compared to the other models.