In [30]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

In [2]:
df = pd.read_csv("Placement_Data_Full_Class.csv")

In [None]:
df.head()

In [None]:
df.info()

In [8]:
cols = ["gender", "ssc_b", "hsc_b", "hsc_s", "degree_t", "workex", "specialisation", "status"]

In [5]:
df.isnull().sum()

sl_no              0
gender             0
ssc_p              0
ssc_b              0
hsc_p              0
hsc_b              0
hsc_s              0
degree_p           0
degree_t           0
workex             0
etest_p            0
specialisation     0
mba_p              0
status             0
salary            67
dtype: int64

In [6]:
df = df.fillna(0)

In [7]:
df.isnull().sum()

sl_no             0
gender            0
ssc_p             0
ssc_b             0
hsc_p             0
hsc_b             0
hsc_s             0
degree_p          0
degree_t          0
workex            0
etest_p           0
specialisation    0
mba_p             0
status            0
salary            0
dtype: int64

In [9]:
def label_encoder(column):
    le = LabelEncoder().fit(column)
    print(column.name, le.classes_)
    return le.transform(column)

In [10]:
for col in cols:
    df[col] = label_encoder(df[col])

gender ['F' 'M']
ssc_b ['Central' 'Others']
hsc_b ['Central' 'Others']
hsc_s ['Arts' 'Commerce' 'Science']
degree_t ['Comm&Mgmt' 'Others' 'Sci&Tech']
workex ['No' 'Yes']
specialisation ['Mkt&Fin' 'Mkt&HR']
status ['Not Placed' 'Placed']


In [None]:
df.head()

In [12]:
df["gender"].value_counts()

1    139
0     76
Name: gender, dtype: int64

In [None]:
sns.countplot(x="gender", data=df)

In [None]:
sns.lineplot(x="gender", y="salary", data=df)

In [20]:
X = df.drop("salary", axis=1)
y = df["salary"]

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4242)

In [31]:
xgb = XGBRegressor(max_depth= 4, n_estimators=500)
xgb.fit(X_train, y_train)

In [32]:
y_pred = xgb.predict(X_test)

In [33]:
r2_score(y_test, y_pred)

0.5195830082834799

In [34]:
dt = DecisionTreeRegressor(max_depth=4, min_samples_leaf=20)
dt.fit(X_train, y_train)

In [35]:
y_pred = dt.predict(X_test)

In [36]:
r2_score(y_test, y_pred)

0.5686861190236734

In [37]:
linreg = LinearRegression()
linreg.fit(X_train, y_train)

In [38]:
y_pred = linreg.predict(X_test)

In [39]:
r2_score(y_test, y_pred)

0.5791355790837354

In [40]:
pickle.dump(linreg, open("linreg.pkl", "wb"))