In [1]:
# Import the libraries used in this notebook
import numpy as np
import pandas as pd
from  sklearn.model_selection import train_test_split
from  sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

In [2]:
# Load the engineering-graduate salary CSV from the relative input directory
df=pd.read_csv('../input/engineering-graduate-salary-prediction/Engineering_graduate_salary.csv')
# Display the frame as the cell output (pandas truncates long frames to the first and last rows)
df

Unnamed: 0,ID,Gender,DOB,10percentage,10board,12graduation,12percentage,12board,CollegeID,CollegeTier,...,MechanicalEngg,ElectricalEngg,TelecomEngg,CivilEngg,conscientiousness,agreeableness,extraversion,nueroticism,openess_to_experience,Salary
0,604399,f,1990-10-22,87.80,cbse,2009,84.00,cbse,6920,1,...,-1,-1,-1,-1,-0.1590,0.3789,1.2396,0.14590,0.2889,445000
1,988334,m,1990-05-15,57.00,cbse,2010,64.50,cbse,6624,2,...,-1,-1,-1,-1,1.1336,0.0459,1.2396,0.52620,-0.2859,110000
2,301647,m,1989-08-21,77.33,"maharashtra state board,pune",2007,85.17,amravati divisional board,9084,2,...,-1,-1,260,-1,0.5100,-0.1232,1.5428,-0.29020,-0.2875,255000
3,582313,m,1991-05-04,84.30,cbse,2009,86.00,cbse,8195,1,...,-1,-1,-1,-1,-0.4463,0.2124,0.3174,0.27270,0.4805,420000
4,339001,f,1990-10-30,82.00,cbse,2008,75.00,cbse,4889,2,...,-1,-1,-1,-1,-1.4992,-0.7473,-1.0697,0.06223,0.1864,200000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2993,103174,f,1989-04-17,75.00,0,2005,73.00,0,1263,2,...,-1,-1,-1,-1,-1.1901,0.9688,-1.0697,1.35490,0.0284,120000
2994,352811,f,1991-07-22,84.00,state board,2008,77.00,state board,9481,2,...,-1,-1,-1,-1,-0.1082,0.0328,-0.4891,-0.29020,0.5024,120000
2995,287070,m,1988-11-24,91.40,bsemp,2006,65.56,bsemp,547,2,...,-1,-1,-1,-1,-0.8810,0.1888,-0.3440,0.06230,0.6603,385000
2996,317336,m,1988-08-25,88.64,karnataka education board,2006,65.16,karnataka education board,1629,2,...,-1,-1,-1,-1,1.4374,1.2808,-0.4891,-1.46537,0.5419,530000


In [3]:
# Summarise the raw frame: row count, per-column non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2998 entries, 0 to 2997
Data columns (total 34 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ID                     2998 non-null   int64  
 1   Gender                 2998 non-null   object 
 2   DOB                    2998 non-null   object 
 3   10percentage           2998 non-null   float64
 4   10board                2998 non-null   object 
 5   12graduation           2998 non-null   int64  
 6   12percentage           2998 non-null   float64
 7   12board                2998 non-null   object 
 8   CollegeID              2998 non-null   int64  
 9   CollegeTier            2998 non-null   int64  
 10  Degree                 2998 non-null   object 
 11  Specialization         2998 non-null   object 
 12  collegeGPA             2998 non-null   float64
 13  CollegeCityID          2998 non-null   int64  
 14  CollegeCityTier        2998 non-null   int64  
 15  Coll

In [51]:
#creating preprocessing function
def preprocessing (df):
    """Turn the raw salary frame into scaled train/test features and targets.

    Steps, in order: drop ``ID``; encode ``Gender`` (``m`` -> 1, ``f`` -> 0);
    expand ``DOB`` into year/month/day columns; one-hot encode the categorical
    columns with ``onehot_encode``; treat every ``-1`` as missing and fill it
    with the column mean; separate ``Salary`` as the target; make a 70/30
    split with ``random_state=1``; standardise the features with a scaler
    fitted on the training rows only.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing at least ``ID``, ``Gender``, ``DOB``, ``10board``,
        ``12board``, ``Degree``, ``Specialization``, ``CollegeState`` and
        ``Salary``. The caller's frame is not modified.

    Returns
    -------
    x_train, x_test : pandas.DataFrame
        Standardised feature matrices with a fresh 0..n-1 index.
    y_train, y_test : pandas.Series
        ``Salary`` values for the matching rows. They keep the row labels
        assigned by the split, so they line up with the feature frames by
        position, not by index label.
    """
    df=df.copy()
    # ID is dropped before modelling (the original author considered it uninformative).
    df=df.drop('ID',axis=1)
    # Binary-encode gender.
    df['Gender']=df['Gender'].replace({'m':1,'f':0})
    # Parse DOB once and expand it with the vectorised .dt accessor instead of
    # a Python-level lambda per row; the datetime column itself is then dropped.
    dob=pd.to_datetime(df['DOB'])
    df['DOB_year']=dob.dt.year
    df['DOB_month']=dob.dt.month
    df['DOB_day']=dob.dt.day
    df=df.drop('DOB',axis=1)
    # One-hot encode the categorical columns.
    for column in ['10board','12board','Degree','Specialization','CollegeState']:
        df=onehot_encode(df,column)
    # -1 is treated as "missing" everywhere in the frame. np.nan is used because
    # the np.NaN alias was removed in NumPy 2.0.
    df=df.replace(-1,np.nan)
    # Compute the missing-value mask once; the original recomputed
    # df.isna().sum() for every column, which is quadratic in column count.
    missing_column=df.columns[df.isna().any()]
    # NOTE(review): the means are taken over the full frame before the split,
    # so test rows influence the imputed values (mild leakage) - confirm
    # whether that is acceptable for this analysis.
    for column in missing_column:
        df[column]=df[column].fillna(df[column].mean())
    # Split into features and target.
    y=df['Salary']
    x=df.drop('Salary',axis=1)
    x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7,random_state=1)
    # Fit the scaler on the training rows only, then apply it to both splits.
    scaler=StandardScaler()
    x_train=pd.DataFrame(scaler.fit_transform(x_train),columns=x.columns)
    x_test=pd.DataFrame(scaler.transform(x_test),columns=x.columns)

    return x_train,x_test,y_train,y_test

In [33]:
def onehot_encode(df,column):
    """Return a new frame in which ``column`` is replaced by indicator columns.

    ``pd.get_dummies`` builds one indicator column per category found in
    ``df[column]``, each named with ``column`` as its prefix. The indicators
    are appended after the remaining columns, and the input frame is left
    untouched.
    """
    indicator_cols=pd.get_dummies(df[column],prefix=column)
    remaining=df.drop(column,axis=1)
    return pd.concat([remaining,indicator_cols],axis=1)


In [26]:
# Count the distinct values in every object-dtype column
# NOTE(review): `x` is not assigned in any cell visible here - confirm it is
# defined before this cell on a fresh Restart & Run All.
{name:len(x[name].unique()) for name in x.select_dtypes(include='object')}

{'10board': 221,
 '12board': 277,
 'Degree': 4,
 'Specialization': 42,
 'CollegeState': 26}

In [27]:
# List the distinct values of the Degree column
x['Degree'].unique()

array(['B.Tech/B.E.', 'M.Tech./M.E.', 'MCA', 'M.Sc. (Tech.)'],
      dtype=object)

In [28]:
# List the distinct values of the Specialization column
x['Specialization'].unique()

array(['instrumentation and control engineering',
       'computer science & engineering',
       'electronics & telecommunications', 'biotechnology',
       'mechanical engineering', 'information technology',
       'electronics and communication engineering',
       'computer engineering', 'computer application',
       'computer science and technology', 'electrical engineering',
       'automobile/automotive engineering',
       'electronics and electrical engineering',
       'information science engineering', 'chemical engineering',
       'instrumentation engineering', 'electronics & instrumentation eng',
       'ceramic engineering', 'metallurgical engineering',
       'aeronautical engineering', 'electronics engineering',
       'electronics and instrumentation engineering',
       'applied electronics and instrumentation', 'civil engineering',
       'computer and communication engineering',
       'industrial & production engineering', 'computer networking',
       'other', '

In [52]:
# Run the preprocessing function on the raw frame and unpack the four splits
x_train,x_test,y_train,y_test=preprocessing(df)
# Show the scaled training features as the cell output
x_train

Unnamed: 0,Gender,10percentage,12graduation,12percentage,CollegeID,CollegeTier,collegeGPA,CollegeCityID,CollegeCityTier,GraduationYear,...,CollegeState_Orissa,CollegeState_Punjab,CollegeState_Rajasthan,CollegeState_Sikkim,CollegeState_Tamil Nadu,CollegeState_Telangana,CollegeState_Union Territory,CollegeState_Uttar Pradesh,CollegeState_Uttarakhand,CollegeState_West Bengal
0,0.57258,-0.357629,-1.894168,-0.701246,-1.045708,0.281457,-1.132292,-1.045708,1.538693,-1.987117,...,-0.212935,-0.230679,4.896549,-0.03089,-0.310054,-0.294081,-0.048877,-0.539515,-0.168616,-0.229531
1,0.57258,-0.470995,-1.279099,-0.685117,-1.071061,0.281457,-0.210547,-1.071061,-0.649902,-1.219068,...,-0.212935,-0.230679,-0.204225,-0.03089,-0.310054,-0.294081,-0.048877,1.853515,-0.168616,-0.229531
2,0.57258,1.080330,1.181180,0.788879,-1.007573,0.281457,1.779528,-1.007573,-0.649902,1.085080,...,-0.212935,-0.230679,-0.204225,-0.03089,-0.310054,-0.294081,-0.048877,1.853515,-0.168616,-0.229531
3,0.57258,0.125668,-0.664029,-1.652847,-0.150169,0.281457,-1.350927,-0.150169,-0.649902,-1.219068,...,-0.212935,-0.230679,-0.204225,-0.03089,-0.310054,-0.294081,-0.048877,1.853515,-0.168616,-0.229531
4,0.57258,0.141579,-2.509238,0.264692,-1.068756,0.281457,-1.152168,-1.068756,-0.649902,-1.987117,...,-0.212935,-0.230679,-0.204225,-0.03089,-0.310054,-0.294081,-0.048877,1.853515,-0.168616,-0.229531
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2093,0.57258,-0.537623,0.566110,-0.667196,-0.147864,0.281457,-0.103714,-0.147864,-0.649902,0.317031,...,-0.212935,-0.230679,-0.204225,-0.03089,-0.310054,-0.294081,-0.048877,1.853515,-0.168616,-0.229531
2094,0.57258,0.026224,-0.048959,-0.936010,1.346726,0.281457,-1.698755,1.346726,-0.649902,0.317031,...,-0.212935,-0.230679,-0.204225,-0.03089,-0.310054,-0.294081,-0.048877,-0.539515,-0.168616,-0.229531
2095,0.57258,-0.570439,0.566110,-1.115219,0.512161,0.281457,-0.332287,0.512161,-0.649902,0.317031,...,-0.212935,-0.230679,-0.204225,-0.03089,-0.310054,-0.294081,-0.048877,1.853515,-0.168616,-0.229531
2096,0.57258,-0.470995,-0.048959,-0.039964,-0.823185,0.281457,-0.307442,-0.823185,1.538693,-0.451018,...,-0.212935,-0.230679,-0.204225,-0.03089,3.225240,-0.294081,-0.048877,-0.539515,-0.168616,-0.229531


In [54]:
# Fit a linear-regression baseline and report its score on the held-out split
# NOTE(review): the recorded output of this cell is about -3.2e27, so this
# baseline is unusable as-is. Presumably the several hundred sparse one-hot
# columns make the least-squares fit ill-conditioned - confirm, and consider a
# regularised linear model.
model=LinearRegression().fit(x_train,y_train)
model.score(x_test,y_test)

-3.227921821351592e+27

In [57]:
# Fit an XGBoost regressor with default settings and score it on the held-out split
xgb_model=XGBRegressor().fit(x_train,y_train)
xgb_model.score(x_test,y_test)

0.0876383428288453

In [22]:
# Summarise the feature frame: non-null counts and dtypes per column
# NOTE(review): `x` is not assigned in any cell visible here. The recorded
# output (Gender as int64, DOB as datetime64, board columns still object)
# suggests it came from an earlier exploratory run - confirm it exists on a
# fresh Restart & Run All.
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2998 entries, 0 to 2997
Data columns (total 33 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Gender                 2998 non-null   int64         
 1   DOB                    2998 non-null   datetime64[ns]
 2   10percentage           2998 non-null   float64       
 3   10board                2998 non-null   object        
 4   12graduation           2998 non-null   int64         
 5   12percentage           2998 non-null   float64       
 6   12board                2998 non-null   object        
 7   CollegeID              2998 non-null   int64         
 8   CollegeTier            2998 non-null   int64         
 9   Degree                 2998 non-null   object        
 10  Specialization         2998 non-null   object        
 11  collegeGPA             2998 non-null   float64       
 12  CollegeCityID          2998 non-null   int64         
 13  Col

In [42]:
# Total number of missing cells across the whole feature frame
x.isna().sum().sum()

0