In [34]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from lightgbm import LGBMRegressor

In [35]:
# Load the admissions dataset (path is relative to the working directory)
# and preview the first rows.
df = pd.read_csv('Admission_Predict.csv')
df.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [36]:
# Summarise column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Serial No.         400 non-null    int64  
 1   GRE Score          400 non-null    int64  
 2   TOEFL Score        400 non-null    int64  
 3   University Rating  400 non-null    int64  
 4   SOP                400 non-null    float64
 5   LOR                400 non-null    float64
 6   CGPA               400 non-null    float64
 7   Research           400 non-null    int64  
 8   Chance of Admit    400 non-null    float64
dtypes: float64(4), int64(5)
memory usage: 28.3 KB


In [37]:
# Split the frame into features (X) and target (y).
# The CSV header really is 'Chance of Admit ' with a trailing space.
target_col = 'Chance of Admit '
X = df.drop(target_col, axis=1)

# Keep one raw feature row, still keyed by the original CSV headers, so the
# request schema can be exercised against it later.
sample_row = X.iloc[0,:].to_dict()

# Strip spaces and dots from the feature names before fitting.
# regex=False states the literal intent explicitly: the default for `regex`
# differs between pandas versions, and as a regular expression '.' would
# match every character rather than just the dot.
# NOTE(review): 'Serial No.' looks like a row identifier rather than a
# predictive feature - consider dropping it (the request schema would have to
# change with it). TODO confirm.
X.columns = X.columns.str.replace(' ', '', regex=False)
X.columns = X.columns.str.replace('.', '', regex=False)
y = pd.DataFrame(df[target_col])


# 70/30 train/validation split with a fixed seed for reproducibility.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=0)

df.columns

Index(['Serial No.', 'GRE Score', 'TOEFL Score', 'University Rating', 'SOP',
       'LOR ', 'CGPA', 'Research', 'Chance of Admit '],
      dtype='object')

In [38]:
# Fit a LightGBM regressor with library-default hyperparameters, then predict
# on the validation split.
model = LGBMRegressor().fit(X_train, y_train)
preds = model.predict(X_valid)

# Report validation error.
mse = mean_squared_error(y_valid, preds)
mae = mean_absolute_error(y_valid, preds)
print(f'mse: {mse}')
print(f'mae: {mae}')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003754 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 270
[LightGBM] [Info] Number of data points in the train set: 280, number of used features: 8
[LightGBM] [Info] Start training from score 0.725893
mse: 0.003931790412582483
mae: 0.044956236540495305


In [43]:
from pydantic import BaseModel, Field, ValidationError

class Student(BaseModel):
    """Request schema for a single applicant.

    Every field is required. Each alias is the exact column header from the
    dataset (note the trailing space in 'LOR '), while the attribute names
    are the Python-identifier equivalents.
    """
    Serial_No: int = Field(alias='Serial No.')
    GRE_Score: int = Field(alias='GRE Score')
    TOEFL_Score: int = Field(alias='TOEFL Score')
    University_Rating: int = Field(alias='University Rating')
    SOP: float = Field(alias='SOP')
    LOR: float = Field(alias='LOR ')
    CGPA: float = Field(alias='CGPA')
    Research: int = Field(alias='Research')

# Smoke-test the schema and the fitted model on the first row of the dataset.
print(sample_row)
student = Student(**sample_row)

# Dump by alias so the frame carries the original CSV headers ('GRE Score',
# 'LOR ', ...). The default dump uses the Python field names ('GRE_Score',
# ...), which the replacements below cannot turn into the names the model was
# fitted on ('GREScore', ...); prediction would then only line up by column
# position.
inputs = pd.DataFrame([student.model_dump(by_alias=True)])

# Same normalisation as applied to the training features; regex=False so the
# '.' is removed literally regardless of the pandas version's default.
inputs.columns = inputs.columns.str.replace(' ', '', regex=False)
inputs.columns = inputs.columns.str.replace('.', '', regex=False)

pred = model.predict(inputs)[0]
float(pred)


{'Serial No.': 1.0, 'GRE Score': 337.0, 'TOEFL Score': 118.0, 'University Rating': 4.0, 'SOP': 4.5, 'LOR ': 4.5, 'CGPA': 9.65, 'Research': 1.0}


0.9424003820733591

In [None]:
from flask import Flask, request, jsonify

# Flask application object; the /predict route below is registered on it.
app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict():
    """Predict the admission chance for one applicant.

    Expects a JSON object keyed by the original dataset headers (the aliases
    declared on ``Student``, e.g. ``'GRE Score'``, ``'LOR '``).

    Returns:
        ``({'prediction': <float>}, 200)`` on success, or
        ``(<list of pydantic error dicts>, 400)`` when the body fails
        validation against ``Student``.
    """
    try:
        # model_validate instead of Student(**payload): a JSON body that is
        # not an object (list, string, number, null) is reported as a
        # ValidationError -> 400, rather than raising TypeError from the
        # ** unpacking and surfacing as a 500.
        student = Student.model_validate(request.json)
    except ValidationError as e:
        return jsonify(e.errors()), 400

    # Dump by alias so the columns start from the original headers; the default
    # dump yields the Python field names ('GRE_Score', ...), which the
    # replacements below cannot map to the fitted feature names ('GREScore', ...).
    inputs = pd.DataFrame([student.model_dump(by_alias=True)])

    # Same normalisation as applied to the training features; regex=False so
    # the '.' is removed literally regardless of the pandas version's default.
    inputs.columns = inputs.columns.str.replace(' ', '', regex=False)
    inputs.columns = inputs.columns.str.replace('.', '', regex=False)

    pred = model.predict(inputs)[0]
    response = {
        # Cast the numpy scalar to a plain float so it is JSON-serialisable.
        'prediction': float(pred)
    }

    return jsonify(response), 200

# Start the Flask server with its default settings when run as the main module.
# NOTE(review): inside a notebook kernel __name__ is '__main__' too, so this
# presumably blocks the cell until the server is interrupted - confirm intended.
if __name__=='__main__':
    app.run()