<a href="https://colab.research.google.com/github/wnubhav/GPA-Predictor-Sem-4-/blob/main/GPA_Predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Data Preparation and Preprocessing

In [110]:
!pip install pdfplumber pandas




In [112]:
#Dataset preparation
import pdfplumber
import pandas as pd

# Function to extract SGPA data
def extract_sgpa(pdf_path, semester_number):
    """Read one semester's result PDF and return its SGPA table as a DataFrame.

    Every page is scanned with ``page.extract_table()``; the first row of each
    extracted table is treated as the header row and the remaining rows as
    student records.

    Args:
        pdf_path: Path of the result PDF to open with pdfplumber.
        semester_number: Semester label used to name the SGPA column
            (``'SGPA Sem <semester_number>'``).

    Returns:
        A DataFrame with the columns ``'Examroll'``, ``'Student Name'`` and
        ``'SGPA Sem <semester_number>'``. All values are whitespace-stripped
        strings; a missing or empty cell becomes ``''``. The three columns are
        present even when no table rows were found, so the frame can always be
        merged on the roll/name keys.
    """
    sgpa_column = f'SGPA Sem {semester_number}'

    def _text(record, key):
        # Table cells may be None (empty cell) as well as absent; both map to ''.
        # Calling .strip() on None was the crash in the previous version.
        value = record.get(key)
        return value.strip() if isinstance(value, str) else ''

    data = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            table = page.extract_table()
            if not table:
                continue
            # Normalise header cells so stray whitespace or None headers do not
            # prevent the lookups below from matching.
            headers = [h.strip() if isinstance(h, str) else '' for h in table[0]]
            for row in table[1:]:
                record = dict(zip(headers, row))
                data.append({
                    'Examroll': _text(record, 'Examroll'),
                    'Student Name': _text(record, 'Student Name'),
                    sgpa_column: _text(record, 'SGPA'),
                })
    # Passing explicit columns keeps the schema stable for an empty result.
    return pd.DataFrame(data, columns=['Examroll', 'Student Name', sgpa_column])

# One SGPA frame per semester result sheet
df1 = extract_sgpa("/content/BTech_Result_CST_1st.pdf", 1)
df2 = extract_sgpa("/content/BTech_Result_CST_2nd.pdf", 2)
df3 = extract_sgpa("/content/BTech_Result_CST_3rd.pdf", 3)

# Outer-join the three semesters on roll number + name so that students
# missing from any one sheet are kept (with NaN for that semester).
merge_keys = ['Examroll', 'Student Name']
merged = (
    df1
    .merge(df2, on=merge_keys, how='outer')
    .merge(df3, on=merge_keys, how='outer')
)

# Preview the combined table
merged.head()




Unnamed: 0,Examroll,Student Name,SGPA Sem 1,SGPA Sem 2,SGPA Sem 3
0,2022CSB013,Kapil Dev Raykwar,0.0,2.46,
1,2022CSB056,Anup Sarkar,5.63,6.21,5.75
2,2022CSB063,Dipanjan Dhibar,3.5,2.38,
3,2022CSB100,Davis Dhanda,3.54,2.46,
4,2023CSB001,Sanchari Ray,5.92,7.5,7.42


In [113]:
from google.colab import files

# Persist the raw merged table, then hand the file to the browser for download.
merged_csv_path = "/content/merged_sgpa.csv"
merged.to_csv(merged_csv_path, index=False)
files.download(merged_csv_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [114]:
# Missing value handling: keep only students that have an SGPA entry (non-NaN)
# for all three semesters. Reassigning instead of mutating in place keeps the
# cell's effect explicit.
sgpa_columns = ['SGPA Sem 1', 'SGPA Sem 2', 'SGPA Sem 3']
merged = merged.dropna(subset=sgpa_columns)


In [115]:

# Drop every column that still contains at least one NaN.
merged = merged.dropna(axis='columns')


In [116]:
# Show the last five rows of the cleaned table as a sanity check.
merged.tail()

Unnamed: 0,Examroll,Student Name,SGPA Sem 1,SGPA Sem 2,SGPA Sem 3
100,2023CSB098,Wadadare Piyush Rajendra,8.25,7.88,8.38
101,2023CSB099,Kyatham Saichaithanya,5.63,6.29,5.88
102,2023CSB100,Achanta Sravanthi,8.08,7.58,8.17
103,2023CSB101,Pratyush Dhital,8.67,7.71,8.58
104,2023CSB102,Sneha Anand Das,7.08,6.54,5.96


In [117]:
# Remove the row currently at position 3 (the fourth remaining row) of `merged`.
# NOTE(review): the row is selected by position, not by roll number, so the cell
# is not idempotent -- re-running it removes a different student each time.
# TODO confirm which student was meant and drop by 'Examroll' instead.
merged = merged.drop(merged.index[3])


In [118]:
# Show the first five rows after the cleaning steps above.
merged.head()

Unnamed: 0,Examroll,Student Name,SGPA Sem 1,SGPA Sem 2,SGPA Sem 3
1,2022CSB056,Anup Sarkar,5.63,6.21,5.75
4,2023CSB001,Sanchari Ray,5.92,7.5,7.42
5,2023CSB002,Avighna Basak,5.5,7.04,6.54
8,2023CSB005,Shivank Prashanth,6.0,8.04,7.83
9,2023CSB006,Chandrashekhar Basu,7.71,8.29,9.38


In [119]:
from google.colab import files

# Persist the cleaned table (this is the file the training code reads back),
# then hand it to the browser for download.
updated_csv_path = "/content/merged_sgpa_updated.csv"
merged.to_csv(updated_csv_path, index=False)
files.download(updated_csv_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Training, Testing, and Evaluation

In [120]:
import torch
import torch.nn as nn


class SGPAPredictor(nn.Module):
    """Small fully connected regressor mapping past SGPAs to one predicted SGPA.

    Architecture: ``input_dim -> 64 -> 32 -> 16 -> 1`` with ReLU and dropout
    after each of the three hidden layers; the output layer is linear.

    Args:
        input_dim: Number of input features. Defaults to 3
            (SGPA Sem 1, 2, 3), matching the original hard-coded width.
        dropout_p: Dropout probability applied after every hidden layer.
            Defaults to 0.2, the original hard-coded value.

    The layer attribute names (``fc1`` .. ``fc4``) and their creation order are
    kept as-is so previously saved ``state_dict`` files still load and seeded
    initialisation is unchanged.
    """

    def __init__(self, input_dim=3, dropout_p=0.2):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 16)
        self.fc4 = nn.Linear(16, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, x):
        """Return a ``(batch, 1)`` tensor of predictions for ``(batch, input_dim)`` input."""
        for hidden in (self.fc1, self.fc2, self.fc3):
            x = self.dropout(self.relu(hidden(x)))
        # No activation on the output: this is a regression head.
        return self.fc4(x)


In [121]:
import pandas as pd

def load_and_prepare_data(file_path=None):
    """Load the cleaned SGPA table and build the training arrays.

    The Semester 4 "target" is not observed data: it is synthesised from the
    first three semesters as ``Sem3 + 0.3*(Sem2 - Sem1) + 0.7*(Sem3 - Sem2)``
    and clamped to the valid SGPA range ``[0, 10]``.

    Args:
        file_path: CSV to read. When ``None`` (the default) the notebook's
            standard location ``'/content/merged_sgpa_updated.csv'`` is used.
            (Previously this argument was accepted but silently ignored.)

    Returns:
        A tuple ``(features, target, names, merged)`` where ``features`` is an
        ``(n, 3)`` array of the three semester SGPAs, ``target`` is an
        ``(n, 1)`` array of the synthesised Semester 4 SGPA, ``names`` is the
        array of student names and ``merged`` is the DataFrame they came from
        (including the added ``'Weighted_Increase'`` and ``'SGPA Sem 4'``
        columns).

    Raises:
        KeyError: If the CSV lacks one of the required columns.
    """
    csv_path = '/content/merged_sgpa_updated.csv' if file_path is None else file_path
    merged = pd.read_csv(csv_path)

    sgpa_cols = ['SGPA Sem 1', 'SGPA Sem 2', 'SGPA Sem 3']
    missing = [col for col in sgpa_cols + ['Student Name'] if col not in merged.columns]
    if missing:
        raise KeyError(f"{csv_path} is missing required column(s): {missing}")

    # Blank or non-numeric SGPA cells would turn into NaN features and make every
    # training loss NaN, so coerce to numbers and drop the unusable rows here.
    merged[sgpa_cols] = merged[sgpa_cols].apply(pd.to_numeric, errors='coerce')
    merged = merged.dropna(subset=sgpa_cols).reset_index(drop=True)

    # Weighted trend: the most recent change (Sem 2 -> Sem 3) counts for 70%.
    merged['Weighted_Increase'] = (
        0.3 * (merged['SGPA Sem 2'] - merged['SGPA Sem 1'])
        + 0.7 * (merged['SGPA Sem 3'] - merged['SGPA Sem 2'])
    )
    # Clamp on both sides: a steep downward trend must not yield a negative SGPA.
    merged['SGPA Sem 4'] = (merged['SGPA Sem 3'] + merged['Weighted_Increase']).clip(lower=0, upper=10)

    # Prepare input features and target
    features = merged[sgpa_cols].values
    target = merged['SGPA Sem 4'].values.reshape(-1, 1)
    names = merged['Student Name'].values

    return features, target, names, merged


In [128]:
import copy

import numpy as np
import torch.optim as optim
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler

def train_model_with_cross_validation(X, y, k_folds=5):
    """Train one ``SGPAPredictor`` per fold and report validation metrics.

    For every fold the features and target are min-max scaled using statistics
    fitted on the training split only, a fresh model is trained full-batch with
    Adam and MSE loss for up to 2000 epochs, and training stops early once the
    validation loss has not improved for 100 consecutive epochs. The weights
    with the lowest validation loss are restored before the fold is scored.

    Side effects:
        Writes ``sgpa_model_fold_<k>.pth`` (best weights of fold ``k``) for
        every fold and ``best_sgpa_model_cv.pth`` (weights of the fold with the
        lowest validation MSE) to the current working directory, and prints
        progress and a summary to stdout.

    Args:
        X: ``(n, n_features)`` array of input features.
        y: ``(n, 1)`` array of targets.
        k_folds: Number of cross-validation folds.

    Returns:
        ``(best_model, best_scaler_X, best_scaler_y, metrics_df)`` where the
        first three belong to the fold with the lowest validation MSE (in
        original SGPA units) and ``metrics_df`` has one row per fold with the
        columns ``fold``, ``mse``, ``rmse`` and ``r2``.
    """
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

    # Track metrics for each fold
    fold_metrics = []
    best_model = None
    best_scaler_X = None
    best_scaler_y = None
    best_mse = float('inf')

    print(f"Starting {k_folds}-fold cross-validation")

    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        print(f"\nFold {fold+1}/{k_folds}")
        fold_model_path = f'sgpa_model_fold_{fold+1}.pth'

        # Split data
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # Scale features and target; scalers are fitted on the training split
        # only so no validation statistics leak into training.
        scaler_X = MinMaxScaler()
        scaler_y = MinMaxScaler()
        X_train_scaled = scaler_X.fit_transform(X_train)
        y_train_scaled = scaler_y.fit_transform(y_train)
        X_val_scaled = scaler_X.transform(X_val)
        y_val_scaled = scaler_y.transform(y_val)

        # Convert to pytorch tensors
        X_train_tensor = torch.FloatTensor(X_train_scaled)
        y_train_tensor = torch.FloatTensor(y_train_scaled)
        X_val_tensor = torch.FloatTensor(X_val_scaled)
        y_val_tensor = torch.FloatTensor(y_val_scaled)

        # Initialize model, loss, optimizer
        model = SGPAPredictor()
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

        # Training hyperparameters
        epochs = 2000
        patience = 100
        best_fold_val_loss = float('inf')
        best_fold_state = None
        counter = 0

        for epoch in range(epochs):
            # Training step (full batch)
            model.train()
            optimizer.zero_grad()
            outputs = model(X_train_tensor)
            loss = criterion(outputs, y_train_tensor)
            loss.backward()
            optimizer.step()

            # Validation step
            model.eval()
            with torch.no_grad():
                val_loss = criterion(model(X_val_tensor), y_val_tensor).item()

            # Early stopping check
            if val_loss < best_fold_val_loss:
                best_fold_val_loss = val_loss
                counter = 0
                # state_dict() returns references to the live parameters, so a
                # deep copy is required to freeze this epoch's weights. Keeping
                # it in memory avoids rewriting the checkpoint file every epoch.
                best_fold_state = copy.deepcopy(model.state_dict())
            else:
                counter += 1

            if counter >= patience:
                print(f"Early stopping at epoch {epoch+1}")
                break

            if (epoch+1) % 200 == 0:
                print(f"Epoch [{epoch+1}/{epochs}], Train Loss: {loss.item():.6f}, Val Loss: {val_loss:.6f}")

        # Restore the best weights for this fold. If no epoch ever improved
        # (e.g. the loss was NaN throughout) the last weights are kept instead
        # of loading a stale checkpoint left over from an earlier run.
        if best_fold_state is not None:
            model.load_state_dict(best_fold_state)
        torch.save(model.state_dict(), fold_model_path)

        model.eval()
        with torch.no_grad():
            val_pred_scaled = model(X_val_tensor)
            val_pred = scaler_y.inverse_transform(val_pred_scaled.numpy())
            val_true = y_val

        # Calculate metrics in original SGPA units
        mse = mean_squared_error(val_true, val_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(val_true, val_pred)

        print(f"Fold {fold+1} - MSE: {mse:.4f}, RMSE: {rmse:.4f}, R²: {r2:.4f}")

        fold_metrics.append({
            'fold': fold+1,
            'mse': mse,
            'rmse': rmse,
            'r2': r2
        })

        # Keep track of best model (a new model object is created per fold, so
        # holding the reference is safe).
        if mse < best_mse:
            best_mse = mse
            best_model = model
            best_scaler_X = scaler_X
            best_scaler_y = scaler_y

    # Print summary statistics
    metrics_df = pd.DataFrame(fold_metrics)
    print("\nCross-validation complete. Summary of results:")
    print(f"Mean MSE: {metrics_df['mse'].mean():.4f} ± {metrics_df['mse'].std():.4f}")
    print(f"Mean RMSE: {metrics_df['rmse'].mean():.4f} ± {metrics_df['rmse'].std():.4f}")
    print(f"Mean R²: {metrics_df['r2'].mean():.4f} ± {metrics_df['r2'].std():.4f}")

    # Save the best model
    torch.save(best_model.state_dict(), 'best_sgpa_model_cv.pth')

    return best_model, best_scaler_X, best_scaler_y, metrics_df


In [129]:
def final_model_training(X, y):
    """Fit a fresh ``SGPAPredictor`` on the whole dataset (no validation split).

    Features and target are min-max scaled with scalers fitted on all rows, and
    the model is trained full-batch with Adam and MSE loss for a fixed 3000
    epochs. The final weights are written to ``'final_sgpa_model.pth'``.

    Args:
        X: ``(n, n_features)`` array of input features.
        y: ``(n, 1)`` array of targets.

    Returns:
        ``(model, scaler_X, scaler_y)``. The model is returned in training
        mode; callers must switch it to ``eval()`` before predicting.
    """
    # Fit one scaler per side on the full data and move straight to tensors.
    scaler_X, scaler_y = MinMaxScaler(), MinMaxScaler()
    inputs = torch.FloatTensor(scaler_X.fit_transform(X))
    targets = torch.FloatTensor(scaler_y.fit_transform(y))

    model = SGPAPredictor()
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

    epochs = 3000
    print("\nTraining final model on entire dataset...")
    for step in range(1, epochs + 1):
        model.train()
        optimizer.zero_grad()
        loss = criterion(model(inputs), targets)
        loss.backward()
        optimizer.step()

        # Report progress every 500 epochs.
        if step % 500 == 0:
            print(f"Epoch [{step}/{epochs}], Loss: {loss.item():.6f}")

    torch.save(model.state_dict(), 'final_sgpa_model.pth')
    return model, scaler_X, scaler_y


In [130]:
def predict_sgpa(model, scaler_X, scaler_y, name, X, names):
    """Predict the next-semester SGPA for a single student looked up by name.

    Args:
        model: Trained network; it is switched to eval mode here.
        scaler_X: Fitted feature scaler (needs ``transform``).
        scaler_y: Fitted target scaler (needs ``inverse_transform``).
        name: Student name to look up; the match is exact.
        X: Feature array whose rows are aligned with ``names``.
        names: Sequence of student names.

    Returns:
        The prediction in original SGPA units, or ``None`` when ``name`` does
        not occur in ``names``. If a name occurs more than once, the first
        occurrence is used.
    """
    if name not in names:
        return None

    row = list(names).index(name)
    scaled_row = scaler_X.transform(X[row].reshape(1, -1))

    model.eval()
    with torch.no_grad():
        scaled_output = model(torch.FloatTensor(scaled_row))

    # Undo the target scaling and unwrap the single (1, 1) value.
    return scaler_y.inverse_transform(scaled_output.numpy())[0][0]

def predict_for_all_students(model, scaler_X, scaler_y, X, names):
    """Predict the next-semester SGPA for every student in one batched pass.

    The previous version scaled and ran the network one row at a time and
    re-entered eval mode on every iteration; here the whole feature matrix is
    scaled and evaluated once. Results agree with the per-row version up to
    float32 rounding.

    Args:
        model: Trained network; it is switched to eval mode here.
        scaler_X: Fitted feature scaler (needs ``transform``).
        scaler_y: Fitted target scaler (needs ``inverse_transform``).
        X: Feature array with one row of three semester SGPAs per student,
            aligned with ``names``.
        names: Sequence of student names.

    Returns:
        A DataFrame with the columns ``'Student Name'``, ``'SGPA Sem 1'``,
        ``'SGPA Sem 2'``, ``'SGPA Sem 3'`` and ``'Predicted SGPA Sem 4'``, one
        row per entry of ``names``. An empty ``names`` yields an empty frame.

    Raises:
        IndexError: If ``X`` has fewer rows than ``names`` has entries.
    """
    n_students = len(names)
    if n_students == 0:
        # Nothing to predict; scalers typically reject zero-row input.
        return pd.DataFrame([])
    if len(X) < n_students:
        raise IndexError(
            f"X has {len(X)} rows but {n_students} student names were given"
        )

    # Only the rows that have a matching name are used, as before.
    features = X[:n_students]

    model.eval()
    with torch.no_grad():
        scaled_output = model(torch.FloatTensor(scaler_X.transform(features)))
    predicted = scaler_y.inverse_transform(scaled_output.numpy())

    predictions = [
        {
            'Student Name': name,
            'SGPA Sem 1': row[0],
            'SGPA Sem 2': row[1],
            'SGPA Sem 3': row[2],
            'Predicted SGPA Sem 4': pred[0],
        }
        for name, row, pred in zip(names, features, predicted)
    ]
    return pd.DataFrame(predictions)


In [131]:
def main():
    """Run the whole pipeline: load data, cross-validate, train, predict, then prompt.

    Steps, in order:
      1. Load the prepared features/targets/names.
      2. Run 5-fold cross-validation (its metrics are printed; the returned
         model and scalers are not used further).
      3. Train the final model on all rows.
      4. Write predictions for every student to ``'sgpa_predictions.csv'``.
      5. Enter an interactive loop that looks up single students by name until
         the user types ``q``.
    """
    features, targets, student_names, _table = load_and_prepare_data()

    # Cross-validation is run for its printed report only.
    _cv_model, _cv_scaler_X, _cv_scaler_y, _cv_metrics = train_model_with_cross_validation(
        features, targets, k_folds=5
    )

    model, x_scaler, y_scaler = final_model_training(features, targets)

    print("\nCross-validation and final model training complete.")

    all_predictions = predict_for_all_students(model, x_scaler, y_scaler, features, student_names)
    all_predictions.to_csv('sgpa_predictions.csv', index=False)
    print("All predictions saved to 'sgpa_predictions.csv'")

    print("\nYou can now predict SGPA for Semester 4 for specific students.")
    while True:
        query = input("Enter student name (or 'q' to quit): ").strip()
        if query.lower() == 'q':
            return

        result = predict_sgpa(model, x_scaler, y_scaler, query, features, student_names)
        if result is None:
            print(f"Student '{query}' not found in the dataset.")
            continue
        print(f"Predicted Semester 4 SGPA for {query}: {result:.2f}")


# Run the pipeline when this code is executed directly (not when imported).
# Note that main() ends in an interactive input() loop.
if __name__ == '__main__':
    main()


Starting 5-fold cross-validation

Fold 1/5
Epoch [200/2000], Train Loss: 0.055169, Val Loss: 0.006765
Epoch [400/2000], Train Loss: 0.041910, Val Loss: 0.004696
Early stopping at epoch 460
Fold 1 - MSE: 0.1609, RMSE: 0.4012, R²: 0.8838

Fold 2/5
Early stopping at epoch 146
Fold 2 - MSE: 0.9175, RMSE: 0.9578, R²: 0.4841

Fold 3/5
Epoch [200/2000], Train Loss: 0.036726, Val Loss: 0.227572
Epoch [400/2000], Train Loss: 0.030750, Val Loss: 0.201540
Epoch [600/2000], Train Loss: 0.020931, Val Loss: 0.176674
Epoch [800/2000], Train Loss: 0.017172, Val Loss: 0.178215
Early stopping at epoch 847
Fold 3 - MSE: 5.4338, RMSE: 2.3311, R²: 0.5607

Fold 4/5
Epoch [200/2000], Train Loss: 0.035686, Val Loss: 0.007376
Epoch [400/2000], Train Loss: 0.036474, Val Loss: 0.001483
Early stopping at epoch 519
Fold 4 - MSE: 0.1561, RMSE: 0.3951, R²: 0.9047

Fold 5/5
Early stopping at epoch 189
Fold 5 - MSE: 0.3743, RMSE: 0.6118, R²: 0.7449

Cross-validation complete. Summary of results:
Mean MSE: 1.4085 ± 2.2