In [3]:
#importing library
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge

In [4]:
#location of the campus-placement CSV (relative to the notebook)
DATA_PATH='../input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv'
#read the CSV into a DataFrame
df=pd.read_csv(DATA_PATH)
#leave the frame as the last expression so the cell renders it
df

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.00,Others,91.00,Others,Commerce,58.00,Sci&Tech,No,55.0,Mkt&HR,58.80,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.00,Central,68.00,Central,Arts,64.00,Comm&Mgmt,No,75.0,Mkt&Fin,57.80,Placed,250000.0
3,4,M,56.00,Central,52.00,Central,Science,52.00,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.80,Central,73.60,Central,Commerce,73.30,Comm&Mgmt,No,96.8,Mkt&Fin,55.50,Placed,425000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,211,M,80.60,Others,82.00,Others,Commerce,77.60,Comm&Mgmt,No,91.0,Mkt&Fin,74.49,Placed,400000.0
211,212,M,58.00,Others,60.00,Others,Science,72.00,Sci&Tech,No,74.0,Mkt&Fin,53.62,Placed,275000.0
212,213,M,67.00,Others,67.00,Others,Commerce,73.00,Comm&Mgmt,Yes,59.0,Mkt&Fin,69.72,Placed,295000.0
213,214,F,74.00,Others,66.00,Others,Commerce,58.00,Comm&Mgmt,No,70.0,Mkt&HR,60.23,Placed,204000.0


In [5]:
#getting information about the dataset: index range, per-column non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sl_no           215 non-null    int64  
 1   gender          215 non-null    object 
 2   ssc_p           215 non-null    float64
 3   ssc_b           215 non-null    object 
 4   hsc_p           215 non-null    float64
 5   hsc_b           215 non-null    object 
 6   hsc_s           215 non-null    object 
 7   degree_p        215 non-null    float64
 8   degree_t        215 non-null    object 
 9   workex          215 non-null    object 
 10  etest_p         215 non-null    float64
 11  specialisation  215 non-null    object 
 12  mba_p           215 non-null    float64
 13  status          215 non-null    object 
 14  salary          148 non-null    float64
dtypes: float64(6), int64(1), object(8)
memory usage: 25.3+ KB


In [32]:
#helper that turns two-valued columns into 0/1 flags
def binary_encode(df,column_dict):
    """Return a copy of ``df`` with the given columns replaced by 0/1 flags.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied first, so the caller's frame is not modified.
    column_dict : dict
        Maps a column name to the value that should be encoded as 1.
        Any entry that does not compare equal to that value becomes 0.

    Returns
    -------
    pandas.DataFrame
        The copy, with each listed column overwritten by its flag values.
    """
    encoded=df.copy()
    for name,positive in column_dict.items():
        #bind the current positive value so the helper is self-contained
        def to_flag(value,positive=positive):
            return int(value==positive)
        encoded[name]=encoded[name].apply(to_flag)
    return encoded

    

#helper that expands nominal columns into one indicator column per category
def onehot_encode(df,column_dict):
    """Return a copy of ``df`` with the given columns one-hot encoded.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied first, so the caller's frame is not modified.
    column_dict : dict
        Maps a column name to the prefix used for its indicator columns
        (passed to ``pd.get_dummies`` as ``prefix``).

    Returns
    -------
    pandas.DataFrame
        The copy with each listed column removed and its indicator columns
        appended on the right, in the order the dictionary is iterated.
    """
    result=df.copy()
    for name in column_dict:
        #indicator columns for this feature, named with the requested prefix
        indicator_cols=pd.get_dummies(result[name],prefix=column_dict[name])
        #append the indicators, then remove the source column
        result=pd.concat([result,indicator_cols],axis=1).drop(name,axis=1)
    return result

    

In [59]:
#Preprocessing
def preprocess_inputs(df,train_size=0.7,random_state=123):
    """Encode the placement data, split it for salary regression and scale it.

    Steps, in order:
      1. binary-encode the two-valued columns and one-hot encode the nominal
         ones (via ``binary_encode`` / ``onehot_encode``);
      2. set aside the rows whose ``salary`` is missing as a prediction set
         (their ``sl_no`` values are kept separately as identifiers);
      3. split the remaining rows into train/test features and target;
      4. fit a ``StandardScaler`` on the training features only and apply it
         to the train, test and prediction features.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw placement frame; it is copied, so the caller's frame is unchanged.
    train_size : float, default 0.7
        Passed to ``train_test_split`` as ``train_size``.
    random_state : int, default 123
        Passed to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test, missing_salaries,
        missing_salary_ids)``. The three feature sets are scaled arrays, the
        targets are the ``salary`` values of the corresponding rows, and
        ``missing_salary_ids`` holds the ``sl_no`` of every row without a salary.
        When no salary is missing, ``missing_salaries`` is an empty array with
        zero rows and the same number of columns as ``x_train``.
    """
    df=df.copy()
    #two-valued columns and the value that should be encoded as 1
    binary_feature_dict={
        'gender':'M',
        'ssc_b': 'Central',
        'hsc_b':'Central',
        'workex':'Yes',
        'specialisation':'Mkt&Fin',
        'status':'Placed'
    }
    #nominal columns and the prefix for their indicator columns
    nominal_feature_dict={
        'hsc_s':'hsc',
        'degree_t':'deg'
    }
    df=binary_encode(df,binary_feature_dict)
    df=onehot_encode(df,nominal_feature_dict)
    # NOTE(review): 'status' stays in the feature set; in the rows displayed in this
    # notebook every row without a salary has status 0, so it may be constant in the
    # training data — confirm whether it should be dropped.

    #select the prediction rows by the salary column itself, so a NaN in some
    #other column cannot silently move a labelled row out of the training data
    has_no_salary=df['salary'].isna()
    missing_salaries=df[has_no_salary].drop('salary',axis=1)
    #keep the serial numbers so predictions can be matched back to students
    missing_salary_ids=missing_salaries['sl_no'].copy()
    missing_salaries=missing_salaries.drop('sl_no',axis=1)

    #rows with a known salary form the modelling set; sl_no is an identifier, not a feature
    df=df[~has_no_salary].reset_index(drop=True)
    df=df.drop('sl_no',axis=1)

    #splitting df into x (features) and y (target)
    y=df['salary'].copy()
    x=df.drop('salary',axis=1)
    x_train,x_test,y_train,y_test=train_test_split(
        x,y,train_size=train_size,random_state=random_state
    )

    #fit the scaler on the training split only, then reuse it everywhere else
    scaler=StandardScaler()
    x_train=scaler.fit_transform(x_train)
    x_test=scaler.transform(x_test)
    if len(missing_salaries)>0:
        missing_salaries=scaler.transform(missing_salaries)
    else:
        #StandardScaler rejects zero-row input, so return an empty array of matching width
        missing_salaries=np.empty((0,x_train.shape[1]))
    return x_train,x_test,y_train,y_test,missing_salaries,missing_salary_ids

In [51]:
#total number of missing values across every column of x
# NOTE(review): no cell visible here assigns `x` at notebook scope (it is only a local
# inside preprocess_inputs), so this presumably relied on leftover kernel state and
# would fail after Restart & Run All — confirm and rebuild `x` explicitly or remove.
x.isna().sum(axis=0).sum()

0

In [53]:
#display the rows that have no salary
# NOTE(review): the output below is an encoded DataFrame that still contains sl_no, which
# matches an intermediate value inside preprocess_inputs rather than anything assigned at
# notebook scope before this cell — presumably leftover kernel state; confirm.
missing_salaries

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,degree_p,workex,etest_p,specialisation,mba_p,status,hsc_Arts,hsc_Commerce,hsc_Science,deg_Comm&Mgmt,deg_Others,deg_Sci&Tech
3,4,1,56.0,1,52.0,1,52.00,0,66.00,0,59.43,0,0,0,1,0,0,1
5,6,1,55.0,0,49.8,0,67.25,1,55.00,1,51.58,0,0,0,1,0,0,1
6,7,0,46.0,0,49.2,0,79.00,0,74.28,1,53.29,0,0,1,0,1,0,0
9,10,1,58.0,1,70.0,1,61.00,0,54.00,1,52.21,0,0,1,0,1,0,0
12,13,0,47.0,1,55.0,0,65.00,0,62.00,0,65.04,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,199,0,67.0,1,70.0,1,65.00,0,88.00,0,71.96,0,0,1,0,0,1,0
201,202,1,54.2,1,63.0,0,58.00,0,79.00,0,58.44,0,0,0,1,1,0,0
206,207,1,41.0,1,42.0,1,60.00,0,97.00,1,53.39,0,0,0,1,1,0,0
208,209,0,43.0,1,60.0,0,65.00,0,92.66,0,62.92,0,0,0,1,1,0,0


In [61]:
#run the preprocessing pipeline on the raw frame and unpack its six outputs
(x_train,x_test,y_train,y_test,
 missing_salaries,missing_salaries_id)=preprocess_inputs(df)
#print the shape of each train/test split, one per line
for split in (x_train,x_test,y_train,y_test):
    print(split.shape)

(103, 17)
(45, 17)
(103,)
(45,)


In [11]:
#list the distinct values of every object (string) column of the raw frame;
#uses `df` because no `x` exists at notebook scope on a fresh kernel
{column: list(df[column].unique()) for column in df.select_dtypes('object').columns}

{'gender': ['M', 'F'],
 'ssc_b': ['Others', 'Central'],
 'hsc_b': ['Others', 'Central'],
 'hsc_s': ['Commerce', 'Science', 'Arts'],
 'degree_t': ['Sci&Tech', 'Comm&Mgmt', 'Others'],
 'workex': ['No', 'Yes'],
 'specialisation': ['Mkt&HR', 'Mkt&Fin'],
 'status': ['Placed', 'Not Placed']}

In [37]:
#show every row of x that has at least one missing value
# NOTE(review): no cell visible here assigns `x` at notebook scope; the output below looks
# like the encoded frame (salary and sl_no still present) from inside preprocess_inputs,
# so this presumably relied on leftover kernel state — confirm.
x[x.isna().sum(axis=1)>0]

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,degree_p,workex,etest_p,specialisation,mba_p,status,salary,hsc_Arts,hsc_Commerce,hsc_Science,deg_Comm&Mgmt,deg_Others,deg_Sci&Tech
3,4,1,56.0,1,52.0,1,52.00,0,66.00,0,59.43,0,,0,0,1,0,0,1
5,6,1,55.0,0,49.8,0,67.25,1,55.00,1,51.58,0,,0,0,1,0,0,1
6,7,0,46.0,0,49.2,0,79.00,0,74.28,1,53.29,0,,0,1,0,1,0,0
9,10,1,58.0,1,70.0,1,61.00,0,54.00,1,52.21,0,,0,1,0,1,0,0
12,13,0,47.0,1,55.0,0,65.00,0,62.00,0,65.04,0,,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,199,0,67.0,1,70.0,1,65.00,0,88.00,0,71.96,0,,0,1,0,0,1,0
201,202,1,54.2,1,63.0,0,58.00,0,79.00,0,58.44,0,,0,0,1,1,0,0
206,207,1,41.0,1,42.0,1,60.00,0,97.00,1,53.39,0,,0,0,1,1,0,0
208,209,0,43.0,1,60.0,0,65.00,0,92.66,0,62.92,0,,0,0,1,1,0,0


In [65]:
#training the model: ridge regression with regularisation strength alpha=100,
#fitted on the scaled training split (fit returns the estimator itself)
model=Ridge(alpha=100.0).fit(x_train,y_train)
#score() gives the R^2 of the model's predictions on the test split
model_r2=model.score(x_test,y_test)

In [67]:
#report the test-split R^2 stored in model_r2
r2_message='Model r2 {}'.format(model_r2)
print(r2_message)

Model r2 -0.004891521468379567
