In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, PowerTransformer, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score,mean_squared_error,r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import VotingClassifier,AdaBoostClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers import Dense



In [2]:
df = pd.read_csv(r'employee_salary_dataset.csv')
df

Unnamed: 0,EmployeeID,Name,Department,Experience_Years,Education_Level,Age,Gender,City,Monthly_Salary
0,1,Employee_1,Marketing,15,Master,53,Female,Delhi,111416
1,2,Employee_2,Operations,7,Bachelor,25,Female,Bangalore,95271
2,3,Employee_3,IT,12,High School,51,Female,Hyderabad,69064
3,4,Employee_4,Operations,8,PhD,44,Male,Delhi,95091
4,5,Employee_5,Operations,15,Master,36,Female,Delhi,132450
5,6,Employee_6,Finance,3,High School,50,Male,Mumbai,65818
6,7,Employee_7,IT,14,PhD,57,Male,Mumbai,70525
7,8,Employee_8,IT,17,PhD,34,Female,Bangalore,44830
8,9,Employee_9,IT,4,Bachelor,53,Male,Hyderabad,42429
9,10,Employee_10,Operations,18,High School,28,Male,Mumbai,31893


In [3]:
df.drop(columns =['EmployeeID','Name'], inplace = True)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Department        50 non-null     object
 1   Experience_Years  50 non-null     int64 
 2   Education_Level   50 non-null     object
 3   Age               50 non-null     int64 
 4   Gender            50 non-null     object
 5   City              50 non-null     object
 6   Monthly_Salary    50 non-null     int64 
dtypes: int64(3), object(4)
memory usage: 2.9+ KB


In [7]:
df.isnull().sum()
df.duplicated().sum()

np.int64(0)

In [8]:
objcols = df.select_dtypes(include=['object']).columns
numcols = df.select_dtypes(include=['number']).columns


In [9]:
print(objcols)
print(numcols)

Index(['Department', 'Education_Level', 'Gender', 'City'], dtype='object')
Index(['Experience_Years', 'Age', 'Monthly_Salary'], dtype='object')


In [10]:
le = LabelEncoder()
df['Education_Level'] = le.fit_transform(df['Education_Level'])
df['Gender'] = le.fit_transform(df['Gender'])


In [13]:
df = pd.get_dummies(df)

In [16]:
X = df.drop(columns=['Monthly_Salary'])
Y = df['Monthly_Salary']

In [17]:
st = StandardScaler()
X_Scaled = st.fit_transform(X)

In [18]:
x_train, x_test, y_train, y_test = train_test_split(X_Scaled,Y,test_size=0.2,random_state=42)

In [39]:
pca = PCA(n_components = 0.90)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)


In [40]:
lr = LinearRegression()
dt = DecisionTreeRegressor()
knn = KNeighborsRegressor()


In [47]:
models = {
    'Lr':lr, 'DT':dt, 'KNN':knn
}

for i,j in models.items():
    j.fit(x_train,y_train)
    y_pred_train = j.predict(x_train)
    y_pred_test = j.predict(x_test)

    train_acc = mean_squared_error(y_train, y_pred_train)
    test_acc  = np.sqrt(mean_squared_error(y_test, y_pred_test))
    
    print("====================================")
    print(f"Model: {i}")
    print(f"MSE: {train_acc:.4f}")
    print(f"RMSE : {test_acc:.4f}")

Model: Lr
MSE: 0.0563
RMSE : 0.2503
Model: DT
MSE: 0.0000
RMSE : 0.3271
Model: KNN
MSE: 0.0669
RMSE : 0.2846


In [43]:
model = Sequential()
model.add(Dense(units=20, input_dim = x_train.shape[1] ,activation = 'relu'))
model.add(Dense(units = 20,activation = 'relu'))
model.add(Dense(units=1,activation='linear'))
model.compile(loss='mean_squared_error',optimizer = 'adam')

In [44]:
model.fit(x_train,y_train,epochs = 15, batch_size = 8 , validation_data = (x_test,y_test))

Epoch 1/15
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 50ms/step - loss: 0.5601 - val_loss: 0.3724
Epoch 2/15
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - loss: 0.4394 - val_loss: 0.3217
Epoch 3/15
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - loss: 0.3629 - val_loss: 0.2845
Epoch 4/15
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - loss: 0.2959 - val_loss: 0.2581
Epoch 5/15
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - loss: 0.2406 - val_loss: 0.2383
Epoch 6/15
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - loss: 0.2115 - val_loss: 0.2222
Epoch 7/15
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - loss: 0.1873 - val_loss: 0.2109
Epoch 8/15
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - loss: 0.1681 - val_loss: 0.2014
Epoch 9/15
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

<keras.src.callbacks.history.History at 0x1bc784b5b50>

In [46]:
y_pred = model.predict(x_test)
print(mean_squared_error(y_test,y_pred))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
0.17129807905525424
