In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def load_data(path='students.csv'):
    """Read the student records CSV into a DataFrame.

    Parameters
    ----------
    path : str or path-like, default 'students.csv'
        Location of the CSV file. The default keeps the original behaviour
        of reading ``students.csv`` relative to the current working
        directory, so existing ``load_data()`` calls are unaffected.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv`` with default options.
    """
    return pd.read_csv(path)
# Read the raw student table.
df = load_data()

# Seed NumPy's global generator so the random department draw is repeatable.
np.random.seed(1)

# Department code -> sampling probability (the weights sum to 1).
major_weights = {
    'TMBA': 0.08,
    'SEMBA': 0.09,
    'ME': 0.1,
    'IMMBA': 0.07,
    'PMBA': 0.06,
    'FMBA': 0.085,
    'MFE': 0.115,
    'IMMS': 0.05,
    'GBP': 0.15,
    'EMBA': 0.095,
    'DMFBA': 0.105,
}
major_spec = list(major_weights)

# One randomly drawn department label per row of the raw table.
major_generated = np.random.choice(
    major_spec,
    size=len(df),
    p=list(major_weights.values()),
)
# Preprocess

def next_data(df, departments=None):
    """Clean the raw student table and attach a department column.

    Steps, in order: drop the ``ssc_b`` and ``hsc_b`` columns, rename the
    remaining columns positionally, round the numeric score/salary columns,
    drop ``post_grad_spec``, set the salary of "Not Placed" rows to 0, add
    the ``department`` column and expand the gender codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table. After ``ssc_b`` and ``hsc_b`` are removed it must have
        exactly twelve columns, in the order of the names assigned below
        (the rename is positional). The caller's frame is not modified.
    departments : sequence, optional
        One department label per row of ``df``. When omitted, the
        notebook-level ``major_generated`` array is used, which keeps
        existing ``next_data(df=df)`` calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.
    """
    if departments is None:
        # Backward-compatible default: fall back to the notebook-level draw.
        departments = major_generated

    # `drop` returns a new frame, so everything below works on a copy.
    cleaned = df.drop(['ssc_b', 'hsc_b'], axis=1)

    # Positional rename: raises ValueError if the column count differs.
    cleaned.columns = [
        'gender', '10_grade', '12_grade', 'spec_higher_edu',
        'degree_percent', 'undergrad_major', 'work_exp', 'employ_test',
        'post_grad_spec', 'post_grad_percent', 'status', 'salary',
    ]

    # Round the score and salary columns to whole numbers.
    for col in ('salary', '10_grade', '12_grade',
                'post_grad_percent', 'degree_percent'):
        cleaned[col] = cleaned[col].round()

    cleaned = cleaned.drop('post_grad_spec', axis=1)

    # Rows that were not placed get a salary of 0.
    cleaned.loc[cleaned['status'] == 'Not Placed', 'salary'] = 0

    cleaned['department'] = departments

    # Values other than 'M' / 'F' are not in the mapping and become NaN.
    cleaned['gender'] = cleaned['gender'].map({'M': 'Male', 'F': 'Female'})
    return cleaned
# Build the cleaned frame that the modelling cells below work from.
dataset = next_data(df)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Preprocessing / model objects, each built with the library's default
# settings: a one-hot encoder, a standard scaler and a random-forest classifier.
encoder, scaler, model = OneHotEncoder(), StandardScaler(), RandomForestClassifier()

### First Model -- Classifier

In [4]:
# Columns excluded from the placement classifier's training frame.
cols_to_drop = [
    'spec_higher_edu',
    '10_grade',
    '12_grade',
    'employ_test',
    'post_grad_percent',
    'salary',
]
train = dataset.drop(columns=cols_to_drop)

# Features are every remaining column except the label; the label is
# encoded as Placed -> 1, Not Placed -> 0.
status_codes = {'Placed': 1, 'Not Placed': 0}
X = train.drop(columns='status')
y = train['status'].map(status_codes)

In [6]:
# Preview the first three feature rows.
X.iloc[:3]

Unnamed: 0,gender,degree_percent,undergrad_major,work_exp,department
0,Male,58.0,Sci&Tech,No,FMBA
1,Male,77.0,Sci&Tech,Yes,GBP
2,Male,64.0,Comm&Mgmt,No,TMBA


In [8]:
# One-hot encode the string-valued feature columns, then standardise every
# resulting column with the shared scaler (fit and transform on the same data).
X_dump = pd.get_dummies(data=X)
X_scale = scaler.fit(X_dump).transform(X_dump)
X_scale

array([[-1.13869783, -0.73943397,  0.73943397, ..., -0.23221018,
        -0.34610933, -0.35447771],
       [ 1.44551364, -0.73943397,  0.73943397, ..., -0.23221018,
        -0.34610933, -0.35447771],
       [-0.32263105, -0.73943397,  0.73943397, ..., -0.23221018,
        -0.34610933,  2.82105181],
       ...,
       [ 0.90146912, -0.73943397,  0.73943397, ..., -0.23221018,
        -0.34610933, -0.35447771],
       [-1.13869783,  1.35238581, -1.35238581, ..., -0.23221018,
         2.88926047, -0.35447771],
       [-1.81875348, -0.73943397,  0.73943397, ..., -0.23221018,
         2.88926047, -0.35447771]])

In [9]:
# Train the placement classifier on the scaled one-hot features. `fit`
# returns the estimator itself, so it is bound in the same statement and
# then echoed as the cell output.
classifier_model = RandomForestClassifier().fit(X_scale, y)
classifier_model

In [10]:
import pickle

# Persist the fitted classifier. The context manager guarantees the file
# handle is flushed and closed, even if pickling raises; the previous
# bare `open(...)` left the handle open.
# NOTE(review): only the estimator is written here; the scaler fitted for it
# and the dummy-column layout are not saved by this cell -- confirm that
# whatever loads the pickle can reproduce the same preprocessing.
with open('classifier_model.pkl', 'wb') as model_file:
    pickle.dump(classifier_model, model_file)

In [89]:
# Display the preprocessed frame (all columns, including the added `department`).
dataset

Unnamed: 0,gender,10_grade,12_grade,spec_higher_edu,degree_percent,undergrad_major,work_exp,employ_test,post_grad_percent,status,salary,department
0,Male,67.0,91.0,Commerce,58.0,Sci&Tech,No,55.0,59.0,Placed,270000.0,FMBA
1,Male,79.0,78.0,Science,77.0,Sci&Tech,Yes,86.5,66.0,Placed,200000.0,GBP
2,Male,65.0,68.0,Arts,64.0,Comm&Mgmt,No,75.0,58.0,Placed,250000.0,TMBA
3,Male,56.0,52.0,Science,52.0,Sci&Tech,No,66.0,59.0,Not Placed,0.0,IMMBA
4,Male,86.0,74.0,Commerce,73.0,Comm&Mgmt,No,96.8,56.0,Placed,425000.0,SEMBA
...,...,...,...,...,...,...,...,...,...,...,...,...
210,Male,81.0,82.0,Commerce,78.0,Comm&Mgmt,No,91.0,74.0,Placed,400000.0,PMBA
211,Male,58.0,60.0,Science,72.0,Sci&Tech,No,74.0,54.0,Placed,275000.0,DMFBA
212,Male,67.0,67.0,Commerce,73.0,Comm&Mgmt,Yes,59.0,70.0,Placed,295000.0,ME
213,Female,74.0,66.0,Commerce,58.0,Comm&Mgmt,No,70.0,60.0,Placed,204000.0,SEMBA


In [11]:
from sklearn.ensemble import RandomForestRegressor

In [12]:
# The salary model is trained on placed students only.
placed_mask = dataset['status'].eq('Placed')
regress = dataset.loc[placed_mask]

# Inputs: drop the unused columns together with the target (`salary`) and
# the now-constant `status`; the target is the salary column itself.
X_new = regress.drop(
    columns=['spec_higher_edu', '10_grade', '12_grade', 'employ_test', 'salary', 'status'],
)
y_new = regress['salary']

In [13]:
# Display the regression feature frame (placed students only).
X_new

Unnamed: 0,gender,degree_percent,undergrad_major,work_exp,post_grad_percent,department
0,Male,58.0,Sci&Tech,No,59.0,FMBA
1,Male,77.0,Sci&Tech,Yes,66.0,GBP
2,Male,64.0,Comm&Mgmt,No,58.0,TMBA
4,Male,73.0,Comm&Mgmt,No,56.0,SEMBA
7,Male,66.0,Sci&Tech,Yes,62.0,PMBA
...,...,...,...,...,...,...
209,Male,65.0,Comm&Mgmt,No,56.0,DMFBA
210,Male,78.0,Comm&Mgmt,No,74.0,PMBA
211,Male,72.0,Sci&Tech,No,54.0,DMFBA
212,Male,73.0,Comm&Mgmt,Yes,70.0,ME


In [15]:
# Fresh scaler and regressor for the salary model. Note that this rebinds
# `scaler`, replacing the instance used earlier for the classifier features.
scaler, regressor_model = StandardScaler(), RandomForestRegressor()

In [18]:
# One-hot encode the regression features, then fit the scaler on them and
# apply it to the same data.
X_dump_reg = pd.get_dummies(data=X_new)
X_reg_scale = scaler.fit(X_dump_reg).transform(X_dump_reg)

In [22]:
# Display the one-hot encoded regression features (before scaling).
X_dump_reg

Unnamed: 0,degree_percent,post_grad_percent,gender_Female,gender_Male,undergrad_major_Comm&Mgmt,undergrad_major_Others,undergrad_major_Sci&Tech,work_exp_No,work_exp_Yes,department_DMFBA,department_EMBA,department_FMBA,department_GBP,department_IMMBA,department_IMMS,department_ME,department_MFE,department_PMBA,department_SEMBA,department_TMBA
0,58.0,59.0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0
1,77.0,66.0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0
2,64.0,58.0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
4,73.0,56.0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
7,66.0,62.0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,65.0,56.0,0,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0
210,78.0,74.0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
211,72.0,54.0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
212,73.0,70.0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0


In [20]:
# Train the salary regressor on the scaled features of placed students.
regressor_model.fit(X=X_reg_scale, y=y_new)

In [21]:
# Persist the fitted regressor. The context manager guarantees the file
# handle is flushed and closed, even if pickling raises; the previous
# bare `open(...)` left the handle open.
# NOTE(review): only the estimator is written here; the scaler fitted for it
# and the dummy-column layout are not saved by this cell -- confirm that
# whatever loads the pickle can reproduce the same preprocessing.
with open('regressor_model.pkl', 'wb') as model_file:
    pickle.dump(regressor_model, model_file)