In [1]:
import os

import pandas as pd

# Load the dataset
# The CSV location may be supplied through the INDIA_GDP_CSV environment variable so the
# notebook can run on machines other than the author's; when the variable is unset the
# original local path is used, so existing behavior is unchanged.
file_path = os.environ.get(
    'INDIA_GDP_CSV',
    r'C:\Users\HRUSHIKESH\Downloads\India_GDP_1960-2022.csv',
)
df = pd.read_csv(file_path)

# Display the first few rows
print(df.head())


   Year  GDP in (Billion) $  Per Capita in rupees  Growth %
0  2021             3173.40                182160      8.95
1  2020             2667.69                154640     -6.60
2  2019             2831.55                165760      3.74
3  2018             2702.93                159840      6.45
4  2017             2651.47                158480      6.80


In [2]:
# Predictor columns and regression target, selected by name from the loaded frame.
feature_columns = ['Year', 'Per Capita in rupees', 'Growth %']
target_column = 'GDP in (Billion) $'

X = df[feature_columns]   # DataFrame holding the three predictor columns
y = df[target_column]     # Series holding the value to be predicted

In [4]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor



# Step 1: Apply polynomial feature transformation (degree 2)
# The expansion is a fixed function of each row (no statistics are estimated from the
# data values), so applying it to all rows before the split does not leak information.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

# Step 2: Split data into training and testing sets
# The split is done BEFORE scaling so that the scaler below never sees the test rows.
# test_size and random_state are unchanged, so the same rows land in each partition.
# NOTE(review): df.head() suggests one row per year; a shuffled split then mixes test
# years in between training years. Consider a chronological hold-out - confirm intent.
X_train_poly, X_test_poly, y_train, y_test = train_test_split(
    X_poly, y, test_size=0.2, random_state=42
)

# Step 3: Scale the features
# Mean/variance are estimated from the training rows only and then re-applied to the
# test rows; fitting on the full matrix would leak test-set statistics into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_poly)
X_test = scaler.transform(X_test_poly)

# Full scaled matrix kept under its original name for any later cell that references it.
X_scaled = scaler.transform(X_poly)

# Step 4: Exhaustive hyperparameter search for the XGBoost regressor
# 3 * 4 * 3 * 2 * 2 = 144 candidate settings, each scored by 5-fold R-squared.
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[3, 5, 7, 10],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Base estimator whose parameters the search overrides candidate by candidate.
xgb = XGBRegressor(objective='reg:squarederror', random_state=42)

grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Report the winning combination
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Step 5: Obtain the optimal XGBoost model
# grid_search was built with the default refit=True, so after grid_search.fit(...) it
# already holds the best parameter combination retrained on all of X_train / y_train.
# Reusing that estimator avoids constructing and fitting a second, identical model.
xgb_optimal = grid_search.best_estimator_

# Step 6: Score the tuned model on the held-out test rows
y_pred_test = xgb_optimal.predict(X_test)

# Compute both metrics first, then report them.
test_r2 = r2_score(y_test, y_pred_test)
test_mse = mean_squared_error(y_test, y_pred_test)
print("R-squared (Test Set):", test_r2)
print("Mean Squared Error (Test Set):", test_mse)

# Step 7: 5-fold cross-validated R-squared of the tuned model on the training rows
# NOTE(review): these are the same X_train / y_train and the same fold count that the
# grid search used to pick the parameters, so this is not an independent estimate.
cv_scores = cross_val_score(
    estimator=xgb_optimal,
    X=X_train,
    y=y_train,
    scoring='r2',
    cv=5,
)
print("Cross-Validation R-squared scores:", cv_scores)
print("Average Cross-Validation R-squared:", cv_scores.mean())

# Step 8: Predict GDP for a single hypothetical observation
# Columns are given in the same order as the training feature frame X.
# NOTE(review): Year 2025 is later than the years visible in the data preview; the
# model may not extrapolate beyond its training range - verify before relying on this.
new_data = pd.DataFrame(
    {
        'Year': [2025],
        'Per Capita in rupees': [155320],
        'Growth %': [6],
    }
)

# Re-apply the already-fitted transformers in the order used for training:
# polynomial expansion first, then scaling.
new_data_poly = poly.transform(new_data)
new_data_scaled = scaler.transform(new_data_poly)

# Predict and report the single resulting value
predicted_gdp = xgb_optimal.predict(new_data_scaled)
print("Predicted GDP for New Data:", predicted_gdp[0])


Fitting 5 folds for each of 144 candidates, totalling 720 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.8}
R-squared (Test Set): 0.9734558673322943
Mean Squared Error (Test Set): 26291.072454711797
Cross-Validation R-squared scores: [0.95873851 0.93993482 0.88406522 0.99082122 0.96606184]
Average Cross-Validation R-squared: 0.9479243220098272
Predicted GDP for New Data: 2641.8928
