In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')

# Load your dataset
unemp = pd.read_csv('Unemployment in India.csv', sep='\t')

# Remove leading/trailing spaces in column names
unemp.columns = unemp.columns.str.strip()

# Drop rows with missing values
unemp.dropna(inplace=True)

# Specify the names of categorical columns that need encoding
cat = ['Region', 'Area']  # Replace these with your actual categorical columns

# Encode categorical variables
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
for i in cat:
    unemp[i] = lb.fit_transform(unemp[i])

# Ensure all features are numeric (convert non-numeric columns to numeric where needed)
unemp = unemp.apply(pd.to_numeric, errors='coerce')

# Separate features and target
X = unemp.drop('Estimated Unemployment Rate (%)', axis=1)
y = unemp['Estimated Unemployment Rate (%)']

# Feature Scaling (Standardize features)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Create polynomial features (degree 2)
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X_scaled)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=42)

# Linear Regression Model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# Evaluate Linear Regression
mae_lr = mean_absolute_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
r2_lr = r2_score(y_test, y_pred_lr)

print("Linear Regression Model Performance:")
print(f"MAE: {mae_lr:.2f}, RMSE: {rmse_lr:.2f}, R²: {r2_lr:.2f}")

# Random Forest Model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Evaluate Random Forest
mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)

print("\nRandom Forest Model Performance:")
print(f"MAE: {mae_rf:.2f}, RMSE: {rmse_rf:.2f}, R²: {r2_rf:.2f}")

# Gradient Boosting Model
gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)

# Evaluate Gradient Boosting
mae_gb = mean_absolute_error(y_test, y_pred_gb)
rmse_gb = np.sqrt(mean_squared_error(y_test, y_pred_gb))
r2_gb = r2_score(y_test, y_pred_gb)

print("\nGradient Boosting Model Performance:")
print(f"MAE: {mae_gb:.2f}, RMSE: {rmse_gb:.2f}, R²: {r2_gb:.2f}")

# Cross-validation for Random Forest and Gradient Boosting
rf_cv_scores = cross_val_score(rf_model, X_poly, y, cv=5, scoring='neg_mean_squared_error')
gb_cv_scores = cross_val_score(gb_model, X_poly, y, cv=5, scoring='neg_mean_squared_error')

# Convert negative MSE to positive for interpretation
rf_cv_rmse = np.sqrt(-rf_cv_scores.mean())
gb_cv_rmse = np.sqrt(-gb_cv_scores.mean())

print("\nCross-validation RMSE:")
print(f"Random Forest: {rf_cv_rmse:.2f}")
print(f"Gradient Boosting: {gb_cv_rmse:.2f}")


ValueError: Input X contains NaN.
PolynomialFeatures does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values