# IELTS Score Prediction using Random Forest, XGBoost, and Neural Network

In [10]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import joblib

# Load dataset
df = pd.read_json("students_dataset.json")
df.head()


Unnamed: 0,id,board,stream,gender,english_score,percentage,ielts_score,toefl_score,gre_score,gmat_score,...,geography,total_marks,accountancy,business_studies,economics,math,physics,chemistry,biology,computer_science
0,1,ICSE,Arts,Female,55,54.6,5.0,60,277,496,...,46.0,273,,,,,,,,
1,2,CBSE,Commerce,Male,45,52.6,5.0,60,290,521,...,,263,73.0,57.0,55.0,33.0,,,,
2,3,CBSE,Arts,Female,68,59.8,5.0,60,305,500,...,58.0,299,,,,,,,,
3,4,Rajasthan State Board,Science,Female,83,70.0,6.0,74,315,540,...,,350,,,,86.0,59.0,54.0,68.0,68.0
4,5,Odisha State Board,Commerce,Male,72,85.6,5.5,74,330,608,...,,428,98.0,86.0,77.0,95.0,,,,


In [11]:

# Feature selection
X = df[["board", "gender", "percentage", "english_score"]]
y = df["ielts_score"]

# Encode categorical variables
le_board = LabelEncoder()
le_gender = LabelEncoder()
X["board"] = le_board.fit_transform(X["board"])
X["gender"] = le_gender.fit_transform(X["gender"])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale numerical features
scaler = StandardScaler()
X_train[["percentage", "english_score"]] = scaler.fit_transform(X_train[["percentage", "english_score"]])
X_test[["percentage", "english_score"]] = scaler.transform(X_test[["percentage", "english_score"]])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["board"] = le_board.fit_transform(X["board"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["gender"] = le_gender.fit_transform(X["gender"])


In [12]:

# Random Forest Model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_r2 = r2_score(y_test, rf_pred)
rf_rmse = mean_squared_error(y_test, rf_pred) ** 0.5


In [13]:

# XGBoost Model
xgb_model = XGBRegressor(n_estimators=200, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)
xgb_r2 = r2_score(y_test, xgb_pred)
xgb_rmse = mean_squared_error(y_test, xgb_pred) ** 0.5


In [14]:

# Neural Network Model
nn_model = Sequential([
    Dense(16, activation='relu', input_dim=X_train.shape[1]),
    Dense(8, activation='relu'),
    Dense(1)
])
nn_model.compile(optimizer=Adam(learning_rate=0.01), loss='mse')
nn_model.fit(X_train, y_train, epochs=100, verbose=0)

nn_pred = nn_model.predict(X_test).flatten()
nn_r2 = r2_score(y_test, nn_pred)
nn_rmse = mean_squared_error(y_test, nn_pred) ** 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 529us/step


In [15]:

# Compare results
results = pd.DataFrame({
    "Model": ["Random Forest", "XGBoost", "Neural Network"],
    "R2 Score": [rf_r2, xgb_r2, nn_r2],
    "RMSE": [rf_rmse, xgb_rmse, nn_rmse]
})
print(results)

# Save models
joblib.dump(rf_model, "rf_model.pkl")
joblib.dump(xgb_model, "xgb_model.pkl")
nn_model.save("nn_model.h5")
joblib.dump(le_board, "le_board.pkl")
joblib.dump(le_gender, "le_gender.pkl")
joblib.dump(scaler, "scaler.pkl")

print("✅ All models and encoders saved successfully!")




            Model  R2 Score      RMSE
0   Random Forest  0.759840  0.459464
1         XGBoost  0.797202  0.422214
2  Neural Network  0.791501  0.428108
✅ All models and encoders saved successfully!
