In [2]:
#importing the library
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge

In [3]:
#loading the dataset
df=pd.read_csv('/kaggle/input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv')
#showing the data
df

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.00,Others,91.00,Others,Commerce,58.00,Sci&Tech,No,55.0,Mkt&HR,58.80,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.00,Central,68.00,Central,Arts,64.00,Comm&Mgmt,No,75.0,Mkt&Fin,57.80,Placed,250000.0
3,4,M,56.00,Central,52.00,Central,Science,52.00,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.80,Central,73.60,Central,Commerce,73.30,Comm&Mgmt,No,96.8,Mkt&Fin,55.50,Placed,425000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,211,M,80.60,Others,82.00,Others,Commerce,77.60,Comm&Mgmt,No,91.0,Mkt&Fin,74.49,Placed,400000.0
211,212,M,58.00,Others,60.00,Others,Science,72.00,Sci&Tech,No,74.0,Mkt&Fin,53.62,Placed,275000.0
212,213,M,67.00,Others,67.00,Others,Commerce,73.00,Comm&Mgmt,Yes,59.0,Mkt&Fin,69.72,Placed,295000.0
213,214,F,74.00,Others,66.00,Others,Commerce,58.00,Comm&Mgmt,No,70.0,Mkt&HR,60.23,Placed,204000.0


In [4]:
#getting information about the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sl_no           215 non-null    int64  
 1   gender          215 non-null    object 
 2   ssc_p           215 non-null    float64
 3   ssc_b           215 non-null    object 
 4   hsc_p           215 non-null    float64
 5   hsc_b           215 non-null    object 
 6   hsc_s           215 non-null    object 
 7   degree_p        215 non-null    float64
 8   degree_t        215 non-null    object 
 9   workex          215 non-null    object 
 10  etest_p         215 non-null    float64
 11  specialisation  215 non-null    object 
 12  mba_p           215 non-null    float64
 13  status          215 non-null    object 
 14  salary          148 non-null    float64
dtypes: float64(6), int64(1), object(8)
memory usage: 25.3+ KB


In [12]:
#creating a function to encode binary value
def binary_encode(df,column_dict):
    #creating a copy of the dataframe
    df=df.copy()
    #looping through the dictionary 
    for column,positive_value in column_dict.items():
        #applying lambda function to convert positive value with 1 and else 0
        df[column]=df[column].apply(lambda x:1 if x==positive_value else 0)
    #returning the datafram
    return df




In [22]:
#creating a onehot function
def onehot_encode(df,column_dict):
    #creating the copy of the dataframe
    df=df.copy()
    #looping through the list of column stored in the dictionary
    for column,prefix in column_dict.items():
        #creating a dummies column 
        dummies=pd.get_dummies(df[column],prefix=prefix)
        #concating the dummies column with original dataframe
        df=pd.concat([df,dummies],axis=1)
        #dropping the original column
        df=df.drop(column,axis=1)
    return df

In [58]:
#creating the  function for preprocessing
def preprocessing(df):
    df=df.copy()
    
    #Encode categorical features
    binary_feature_dict={
        'gender':'M',
        'ssc_b':'Central',
        'hsc_b':'Central',
        'workex':'Yes',
        'specialisation':'Mkt&Fin',
        'status':'Placed'
    }
    
    #nominal feature
    nominal_feature_dict={
        'hsc_s':'hsc',
        'degree_t':'deg'
    }
    
    #now apply the functioin
    df=binary_encode(df,binary_feature_dict)
    df=onehot_encode(df,nominal_feature_dict)
    
    #
    missing_salaries=df[df.isna().sum(axis=1)>0]
    missing_salaries=missing_salaries.drop('salary',axis=1)
    missing_salary_ids=missing_salaries['sl_no'].reset_index(drop=True).copy()
    
    #dropping the rows with missing salary
    
    df=df.drop(missing_salaries.index,axis=0).reset_index(drop=True)
    
    
    df=df.drop('sl_no',axis=1)
    
    missing_salaries=missing_salaries.drop('sl_no',axis=1)
    
    #Splitting the data between the target and feature
    
    y=df['salary'].copy()
    x=df.drop('salary',axis=1)
    
    #train_test_split
    
    x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.8,random_state=1)
    
    #scaling the dataset
    
    scaler=StandardScaler()
    
    #fiting the scaler to the train test
    
    
    scaler.fit(x_train)
    
    x_train=pd.DataFrame(scaler.transform(x_train),columns=x_train.columns)
    x_test= pd.DataFrame(scaler.transform(x_test),columns=x_test.columns)
    missing_salaries=scaler.transform(missing_salaries)
    
    
    
    
    return x_train,x_test,y_train,y_test,missing_salaries,missing_salary_ids

In [59]:
x_train,x_test,y_train,y_test,missing_salaries,missing_salary_ids=preprocessing(df)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(118, 17)
(30, 17)
(118,)
(30,)


In [51]:
x_train

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,degree_p,workex,etest_p,specialisation,mba_p,status,hsc_Arts,hsc_Commerce,hsc_Science,deg_Comm&Mgmt,deg_Others,deg_Sci&Tech
0,0.689202,-0.699673,0.872464,0.242317,-0.857493,-0.880063,1.126601,-1.148410,0.771100,-0.990938,0.0,-0.210352,0.918559,-0.842701,0.649374,-0.210352,-0.583874
1,-1.450953,0.097593,0.872464,-1.513269,-0.857493,0.080469,-0.887625,-1.253372,-1.296849,0.625789,0.0,-0.210352,-1.088662,1.186661,0.649374,-0.210352,-0.583874
2,-1.450953,1.749071,-1.146179,-0.584507,-0.857493,-0.167410,-0.887625,-0.998464,0.771100,-0.437706,0.0,-0.210352,-1.088662,1.186661,-1.539944,-0.210352,1.712698
3,-1.450953,1.459778,-1.146179,-1.853060,-0.857493,3.488809,-0.887625,-0.974472,-1.296849,1.356270,0.0,-0.210352,-1.088662,1.186661,-1.539944,-0.210352,1.712698
4,0.689202,1.806019,-1.146179,1.997903,-0.857493,-0.074456,-0.887625,1.700568,-1.296849,0.137011,0.0,-0.210352,0.918559,-0.842701,0.649374,-0.210352,-0.583874
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,0.689202,-0.585778,0.872464,0.752003,1.166190,0.545243,1.126601,0.875864,-1.296849,-0.570195,0.0,-0.210352,0.918559,-0.842701,0.649374,-0.210352,-0.583874
114,0.689202,-0.244093,-1.146179,-1.060215,-0.857493,-0.539229,-0.887625,1.142018,0.771100,-1.669498,0.0,-0.210352,0.918559,-0.842701,0.649374,-0.210352,-0.583874
115,0.689202,-0.244093,0.872464,-0.720424,-0.857493,-0.539229,1.126601,-1.298356,-1.296849,-0.699104,0.0,-0.210352,-1.088662,1.186661,0.649374,-0.210352,-0.583874
116,-1.450953,0.325383,-1.146179,0.412212,-0.857493,0.700167,1.126601,0.575971,0.771100,0.994610,0.0,-0.210352,0.918559,-0.842701,0.649374,-0.210352,-0.583874


In [43]:
missing_salaries

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,degree_p,workex,etest_p,specialisation,mba_p,status,hsc_Arts,hsc_Commerce,hsc_Science,deg_Comm&Mgmt,deg_Others,deg_Sci&Tech
3,1,56.0,1,52.0,1,52.00,0,66.00,0,59.43,0,0,0,1,0,0,1
5,1,55.0,0,49.8,0,67.25,1,55.00,1,51.58,0,0,0,1,0,0,1
6,0,46.0,0,49.2,0,79.00,0,74.28,1,53.29,0,0,1,0,1,0,0
9,1,58.0,1,70.0,1,61.00,0,54.00,1,52.21,0,0,1,0,1,0,0
12,0,47.0,1,55.0,0,65.00,0,62.00,0,65.04,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,0,67.0,1,70.0,1,65.00,0,88.00,0,71.96,0,0,1,0,0,1,0
201,1,54.2,1,63.0,0,58.00,0,79.00,0,58.44,0,0,0,1,1,0,0
206,1,41.0,1,42.0,1,60.00,0,97.00,1,53.39,0,0,0,1,1,0,0
208,0,43.0,1,60.0,0,65.00,0,92.66,0,62.92,0,0,0,1,1,0,0


In [60]:
missing_salary_ids

0       4
1       6
2       7
3      10
4      13
     ... 
62    199
63    202
64    207
65    209
66    215
Name: sl_no, Length: 67, dtype: int64

# Training the Model

In [54]:
model=Ridge(alpha=100.0)
model.fit(x_train,y_train)
model_r2=model.score(x_test,y_test)
print('Model R^2:{:.5f}'.format(model_r2))

Model R^2:0.05043


In [56]:
missing_salary_prediction=model.predict(missing_salaries)

  "X does not have valid feature names, but"


In [62]:
missing_salary_prediction=pd.Series(missing_salary_prediction)


In [63]:
missing_salary_prediction

0     295565.173179
1     285530.429498
2     263287.942857
3     276714.077399
4     258328.148516
          ...      
62    309669.183085
63    270824.678023
64    298167.802267
65    273494.990754
66    280420.850379
Length: 67, dtype: float64

In [67]:
predicted_salary_with_idno=pd.concat([missing_salary_ids,missing_salary_prediction],axis=1)

In [68]:
predicted_salary_with_idno

Unnamed: 0,sl_no,0
0,4,295565.173179
1,6,285530.429498
2,7,263287.942857
3,10,276714.077399
4,13,258328.148516
...,...,...
62,199,309669.183085
63,202,270824.678023
64,207,298167.802267
65,209,273494.990754


In [21]:
{column:list(x[column].unique()) for column in x.select_dtypes('object').columns}

{'gender': ['M', 'F'],
 'ssc_b': ['Others', 'Central'],
 'hsc_b': ['Others', 'Central'],
 'hsc_s': ['Commerce', 'Science', 'Arts'],
 'degree_t': ['Sci&Tech', 'Comm&Mgmt', 'Others'],
 'workex': ['No', 'Yes'],
 'specialisation': ['Mkt&HR', 'Mkt&Fin'],
 'status': ['Placed', 'Not Placed']}

In [30]:
df[df.isna().sum(axis=1)>0]

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
3,4,M,56.0,Central,52.0,Central,Science,52.00,Sci&Tech,No,66.00,Mkt&HR,59.43,Not Placed,
5,6,M,55.0,Others,49.8,Others,Science,67.25,Sci&Tech,Yes,55.00,Mkt&Fin,51.58,Not Placed,
6,7,F,46.0,Others,49.2,Others,Commerce,79.00,Comm&Mgmt,No,74.28,Mkt&Fin,53.29,Not Placed,
9,10,M,58.0,Central,70.0,Central,Commerce,61.00,Comm&Mgmt,No,54.00,Mkt&Fin,52.21,Not Placed,
12,13,F,47.0,Central,55.0,Others,Science,65.00,Comm&Mgmt,No,62.00,Mkt&HR,65.04,Not Placed,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,199,F,67.0,Central,70.0,Central,Commerce,65.00,Others,No,88.00,Mkt&HR,71.96,Not Placed,
201,202,M,54.2,Central,63.0,Others,Science,58.00,Comm&Mgmt,No,79.00,Mkt&HR,58.44,Not Placed,
206,207,M,41.0,Central,42.0,Central,Science,60.00,Comm&Mgmt,No,97.00,Mkt&Fin,53.39,Not Placed,
208,209,F,43.0,Central,60.0,Others,Science,65.00,Comm&Mgmt,No,92.66,Mkt&HR,62.92,Not Placed,
