In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/ingledarshan/DS_C24_NLP_DL/main/hiring.csv")
df

Unnamed: 0,experience,test_score,interview_score,salary
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [3]:
df.isna().sum()

experience         2
test_score         1
interview_score    0
salary             0
dtype: int64

In [4]:
df.experience.fillna(0, inplace=True)

In [5]:
df.isna().sum()

experience         0
test_score         1
interview_score    0
salary             0
dtype: int64

In [6]:
df.test_score.mean()

7.857142857142857

In [7]:
df.test_score.fillna(df.test_score.mean(), inplace=True)

In [8]:
df.isna().sum()

experience         0
test_score         0
interview_score    0
salary             0
dtype: int64

# Dataset is clean now.

In [9]:
df

Unnamed: 0,experience,test_score,interview_score,salary
0,0,8.0,9,50000
1,0,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,7.857143,7,72000
7,eleven,7.0,8,80000


In [10]:
X = df.iloc[:,:-1]
X

Unnamed: 0,experience,test_score,interview_score
0,0,8.0,9
1,0,8.0,6
2,five,6.0,7
3,two,10.0,10
4,seven,9.0,6
5,three,7.0,10
6,ten,7.857143,7
7,eleven,7.0,8


In [12]:
X.shape

(8, 3)

In [13]:
y = df.iloc[:,-1]
y

0    50000
1    45000
2    60000
3    65000
4    70000
5    62000
6    72000
7    80000
Name: salary, dtype: int64

In [14]:
y.shape

(8,)

In [15]:
X

Unnamed: 0,experience,test_score,interview_score
0,0,8.0,9
1,0,8.0,6
2,five,6.0,7
3,two,10.0,10
4,seven,9.0,6
5,three,7.0,10
6,ten,7.857143,7
7,eleven,7.0,8


In [16]:
X.experience

0         0
1         0
2      five
3       two
4     seven
5     three
6       ten
7    eleven
Name: experience, dtype: object

In [17]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   experience       8 non-null      object 
 1   test_score       8 non-null      float64
 2   interview_score  8 non-null      int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 320.0+ bytes


In [18]:
X.experience

0         0
1         0
2      five
3       two
4     seven
5     three
6       ten
7    eleven
Name: experience, dtype: object

In [19]:
# Convert text in the cols to integer values

def conv(x):
    dict = {
        'two':2,
        'three':3,
        'five':5,
        'seven':7,
        'ten':10,
        'eleven':11,
         0:0
    }
    return dict[x]

In [20]:
X.experience = X.experience.apply(lambda x : conv(x))

In [21]:
X

Unnamed: 0,experience,test_score,interview_score
0,0,8.0,9
1,0,8.0,6
2,5,6.0,7
3,2,10.0,10
4,7,9.0,6
5,3,7.0,10
6,10,7.857143,7
7,11,7.0,8


In [22]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   experience       8 non-null      int64  
 1   test_score       8 non-null      float64
 2   interview_score  8 non-null      int64  
dtypes: float64(1), int64(2)
memory usage: 320.0 bytes


# X is ready.

I am not going to do traintestsplit as the dataset itself is very small.

However, you all are encouraged to do it as a part of practice by choosing any Assignments that you have successfully accomplished till now.

In [23]:
# Modeling - Linear Regression

from sklearn.linear_model import LinearRegression
lr = LinearRegression()

In [24]:
# Fit the model

lr.fit(X,y)

LinearRegression()

In [25]:
# Predict on X

y_pred = lr.predict(X)
y_pred

array([52313.61238494, 45722.68644263, 58231.95591138, 63991.7318464 ,
       67429.06277517, 61080.55179794, 75922.72532666, 79307.67351488])

In [26]:
y

0    50000
1    45000
2    60000
3    65000
4    70000
5    62000
6    72000
7    80000
Name: salary, dtype: int64

In [35]:
f = pd.DataFrame(y)
f['Predicted'] = y_pred
f

Unnamed: 0,salary,Predicted
0,50000,52313.612385
1,45000,45722.686443
2,60000,58231.955911
3,65000,63991.731846
4,70000,67429.062775
5,62000,61080.551798
6,72000,75922.725327
7,80000,79307.673515


In [36]:
f.columns

Index(['salary', 'Predicted'], dtype='object')

In [37]:
f.columns = [['Actual', 'Predicted']]

In [38]:
f

Unnamed: 0,Actual,Predicted
0,50000,52313.612385
1,45000,45722.686443
2,60000,58231.955911
3,65000,63991.731846
4,70000,67429.062775
5,62000,61080.551798
6,72000,75922.725327
7,80000,79307.673515


In [39]:
from sklearn.metrics import r2_score
r2_score(y_pred, y)

0.9626511210293308

In [42]:
df

Unnamed: 0,experience,test_score,interview_score,salary
0,0,8.0,9,50000
1,0,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,7.857143,7,72000
7,eleven,7.0,8,80000


In [43]:
X

Unnamed: 0,experience,test_score,interview_score
0,0,8.0,9
1,0,8.0,6
2,5,6.0,7
3,2,10.0,10
4,7,9.0,6
5,3,7.0,10
6,10,7.857143,7
7,11,7.0,8


In [41]:
lr.predict([[3,9,7]])

array([58315.5019167])

In [44]:
lr.predict([[10,10,10]])

array([86612.80419155])

In [45]:
lr.predict([[10,2,3]])

array([55930.47274854])

# Model Deployment

In [46]:
import pickle

pickle.dump(lr,open('model.pkl','wb'))
# Dump this model by the name "model.pkl" in the systems HDD and
# while doing this, write this file using "write bytes" mode.

# Client Side - Anagha's side

In [47]:
anagha_model = pickle.load(open("model.pkl","rb"))

In [48]:
anagha_model.predict([[3,9,7]])

array([58315.5019167])

In [49]:
anagha_model.predict([[10,10,10]])

array([86612.80419155])

In [50]:
anagha_model.predict([[10,2,3]])

array([55930.47274854])

# Happy Learning