## Importing the required libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Reading the dataset

In [2]:
df = pd.read_csv("Placement_Data_Full_Class.csv")
df

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.00,Others,91.00,Others,Commerce,58.00,Sci&Tech,No,55.0,Mkt&HR,58.80,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.00,Central,68.00,Central,Arts,64.00,Comm&Mgmt,No,75.0,Mkt&Fin,57.80,Placed,250000.0
3,4,M,56.00,Central,52.00,Central,Science,52.00,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.80,Central,73.60,Central,Commerce,73.30,Comm&Mgmt,No,96.8,Mkt&Fin,55.50,Placed,425000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,211,M,80.60,Others,82.00,Others,Commerce,77.60,Comm&Mgmt,No,91.0,Mkt&Fin,74.49,Placed,400000.0
211,212,M,58.00,Others,60.00,Others,Science,72.00,Sci&Tech,No,74.0,Mkt&Fin,53.62,Placed,275000.0
212,213,M,67.00,Others,67.00,Others,Commerce,73.00,Comm&Mgmt,Yes,59.0,Mkt&Fin,69.72,Placed,295000.0
213,214,F,74.00,Others,66.00,Others,Commerce,58.00,Comm&Mgmt,No,70.0,Mkt&HR,60.23,Placed,204000.0


## Information of the dataset

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sl_no           215 non-null    int64  
 1   gender          215 non-null    object 
 2   ssc_p           215 non-null    float64
 3   ssc_b           215 non-null    object 
 4   hsc_p           215 non-null    float64
 5   hsc_b           215 non-null    object 
 6   hsc_s           215 non-null    object 
 7   degree_p        215 non-null    float64
 8   degree_t        215 non-null    object 
 9   workex          215 non-null    object 
 10  etest_p         215 non-null    float64
 11  specialisation  215 non-null    object 
 12  mba_p           215 non-null    float64
 13  status          215 non-null    object 
 14  salary          148 non-null    float64
dtypes: float64(6), int64(1), object(8)
memory usage: 25.3+ KB


## Null values

In [4]:
df.isna().sum()

sl_no              0
gender             0
ssc_p              0
ssc_b              0
hsc_p              0
hsc_b              0
hsc_s              0
degree_p           0
degree_t           0
workex             0
etest_p            0
specialisation     0
mba_p              0
status             0
salary            67
dtype: int64

## Dealing with the Null values

In [5]:
df['salary'].fillna(df['salary'].mean(),inplace = True)

In [6]:
df.isna().sum()

sl_no             0
gender            0
ssc_p             0
ssc_b             0
hsc_p             0
hsc_b             0
hsc_s             0
degree_p          0
degree_t          0
workex            0
etest_p           0
specialisation    0
mba_p             0
status            0
salary            0
dtype: int64

## Dealing with categorical values 

In [7]:
df['gender'] = pd.get_dummies(df['gender'],drop_first=True)

In [8]:
df['ssc_b'] = pd.get_dummies(df['ssc_b'],drop_first=True)
df['hsc_b'] = pd.get_dummies(df['hsc_b'],drop_first=True)

In [9]:
df = pd.concat([df,pd.get_dummies(df['hsc_s'])],axis = 1)
del df['hsc_s']

In [10]:
df = pd.concat([df,pd.get_dummies(df['degree_t'])],axis = 1)
del df['degree_t']

In [11]:
df['workex'] = pd.get_dummies(df['workex'],drop_first=True)

In [12]:
df['Mkt&HR'] = pd.get_dummies(df['specialisation'],drop_first=True)
del df['specialisation']

In [13]:
df['status'] = pd.get_dummies(df['status'],drop_first=True)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 19 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sl_no      215 non-null    int64  
 1   gender     215 non-null    uint8  
 2   ssc_p      215 non-null    float64
 3   ssc_b      215 non-null    uint8  
 4   hsc_p      215 non-null    float64
 5   hsc_b      215 non-null    uint8  
 6   degree_p   215 non-null    float64
 7   workex     215 non-null    uint8  
 8   etest_p    215 non-null    float64
 9   mba_p      215 non-null    float64
 10  status     215 non-null    uint8  
 11  salary     215 non-null    float64
 12  Arts       215 non-null    uint8  
 13  Commerce   215 non-null    uint8  
 14  Science    215 non-null    uint8  
 15  Comm&Mgmt  215 non-null    uint8  
 16  Others     215 non-null    uint8  
 17  Sci&Tech   215 non-null    uint8  
 18  Mkt&HR     215 non-null    uint8  
dtypes: float64(6), int64(1), uint8(12)
memory usage: 1

## Correlation

In [15]:
df.corr()

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,degree_p,workex,etest_p,mba_p,status,salary,Arts,Commerce,Science,Comm&Mgmt,Others,Sci&Tech,Mkt&HR
sl_no,1.0,0.074306,-0.078155,0.027214,-0.085711,0.116887,-0.088281,0.059151,0.063636,0.022327,-0.02685942,0.0515504,0.014626,-0.024463,0.018201,-0.039982,0.066666,0.009069,0.04663
gender,0.074306,1.0,-0.068969,0.019429,-0.021334,0.065945,-0.173217,0.085153,0.084294,-0.300531,0.0906704,0.1291073,-0.093246,-0.001087,0.042682,-0.036215,-0.093246,0.084072,-0.10616
ssc_p,-0.078155,-0.068969,1.0,0.116194,0.511472,0.066996,0.538404,0.175675,0.261993,0.388478,0.6078887,0.02357072,-0.196086,-0.101323,0.189847,-0.179581,-0.066127,0.221235,-0.172536
ssc_b,0.027214,0.019429,0.116194,1.0,-0.137013,0.605883,0.03807,-0.040744,-0.018991,0.08312,0.03729651,0.004603322,-0.002758,-0.056671,0.058505,-0.094939,-0.002758,0.10106,-0.051565
hsc_p,-0.085711,-0.021334,0.511472,-0.137013,1.0,-0.019548,0.434206,0.141025,0.245113,0.354823,0.4912279,0.05450558,-0.076627,0.258093,-0.226675,0.112816,-0.131382,-0.053599,-0.24163
hsc_b,0.116887,0.065945,0.066996,0.605883,-0.019548,1.0,0.067229,0.038357,0.039108,0.090201,0.01694454,-0.00624714,-0.11692,-0.073522,0.126446,-0.02744,-0.11692,0.086548,0.002232
degree_p,-0.088281,-0.173217,0.538404,0.03807,0.434206,0.067229,1.0,0.122648,0.22447,0.402364,0.479861,-0.01414817,-0.154817,-0.022724,0.092006,-0.032755,-0.180408,0.123477,-0.218286
workex,0.059151,0.085153,0.175675,-0.040744,0.141025,0.038357,0.122648,1.0,0.056735,0.168811,0.27606,0.1184562,0.053939,-0.056719,0.03327,-0.102517,0.009507,0.102962,-0.191174
etest_p,0.063636,0.084294,0.261993,-0.018991,0.245113,0.039108,0.22447,0.056735,1.0,0.218055,0.1276394,0.1528286,-0.074871,-0.021732,0.055352,-0.012531,0.006907,0.009748,-0.236315
mba_p,0.022327,-0.300531,0.388478,0.08312,0.354823,0.090201,0.402364,0.168811,0.218055,1.0,0.07692165,0.1463245,0.004244,-0.049415,0.04805,-0.102088,-0.036055,0.125009,-0.105728


## Finding the factor that effected the candidate in getting placed

In [16]:
corr_target = abs(df.corr()['status'])
relevent_features = corr_target[corr_target>0.25]
relevent_features

ssc_p       0.607889
hsc_p       0.491228
degree_p    0.479861
workex      0.276060
status      1.000000
Mkt&HR      0.250655
Name: status, dtype: float64

**Therefore the ssc percentage factor effected the candidate in getting placed**

## Checking if percentage matters for a candidate to get placed

In [17]:
corr_target = abs(df.corr()['status'])
relevent_features = corr_target[corr_target>0.25]
relevent_features.head(3)

ssc_p       0.607889
hsc_p       0.491228
degree_p    0.479861
Name: status, dtype: float64

#### Therefore we can conclude percentages matter the most for the candidate to get placed

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 19 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sl_no      215 non-null    int64  
 1   gender     215 non-null    uint8  
 2   ssc_p      215 non-null    float64
 3   ssc_b      215 non-null    uint8  
 4   hsc_p      215 non-null    float64
 5   hsc_b      215 non-null    uint8  
 6   degree_p   215 non-null    float64
 7   workex     215 non-null    uint8  
 8   etest_p    215 non-null    float64
 9   mba_p      215 non-null    float64
 10  status     215 non-null    uint8  
 11  salary     215 non-null    float64
 12  Arts       215 non-null    uint8  
 13  Commerce   215 non-null    uint8  
 14  Science    215 non-null    uint8  
 15  Comm&Mgmt  215 non-null    uint8  
 16  Others     215 non-null    uint8  
 17  Sci&Tech   215 non-null    uint8  
 18  Mkt&HR     215 non-null    uint8  
dtypes: float64(6), int64(1), uint8(12)
memory usage: 1

## Degree Specialisation wise number of placements

In [26]:
print('Specilasation wise number of placements:')
print('-'*40)
print('Marketting and HR : ',df[df['Mkt&HR']==1]['status'].sum())
print('Marketting and Finance : ',df[df['Mkt&HR']==0]['status'].sum())
print('-'*40)

Specilasation wise number of placements:
----------------------------------------
Marketting and HR :  53
Marketting and Finance :  95
----------------------------------------
