In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.decomposition import PCA

from sklearn.pipeline import Pipeline

In [7]:
df = pd.read_csv('dataset.csv')

In [8]:
df

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0
...,...,...,...,...,...,...
6699,49.0,Female,PhD,Director of Marketing,20.0,200000.0
6700,32.0,Male,High School,Sales Associate,3.0,50000.0
6701,30.0,Female,Bachelor's Degree,Financial Manager,4.0,55000.0
6702,46.0,Male,Master's Degree,Marketing Manager,14.0,140000.0


In [9]:
df['Education Level'] = df['Education Level'].replace({
    "Master's Degree": "Master's",
    "Bachelor's Degree": "Bachelor's",
    "PhD": "PHD",
    "phD": "PHD"
})

In [10]:
df

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PHD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0
...,...,...,...,...,...,...
6699,49.0,Female,PHD,Director of Marketing,20.0,200000.0
6700,32.0,Male,High School,Sales Associate,3.0,50000.0
6701,30.0,Female,Bachelor's,Financial Manager,4.0,55000.0
6702,46.0,Male,Master's,Marketing Manager,14.0,140000.0


In [11]:
sorted(df["Gender"].dropna().unique().tolist())

['Female', 'Male', 'Other']

In [12]:
sorted(df["Education Level"].dropna().unique().tolist())

["Bachelor's", 'High School', "Master's", 'PHD']

In [13]:
sorted(df["Job Title"].dropna().unique().tolist())

['Account Manager',
 'Accountant',
 'Administrative Assistant',
 'Back end Developer',
 'Business Analyst',
 'Business Development Manager',
 'Business Intelligence Analyst',
 'CEO',
 'Chief Data Officer',
 'Chief Technology Officer',
 'Content Marketing Manager',
 'Copywriter',
 'Creative Director',
 'Customer Service Manager',
 'Customer Service Rep',
 'Customer Service Representative',
 'Customer Success Manager',
 'Customer Success Rep',
 'Data Analyst',
 'Data Entry Clerk',
 'Data Scientist',
 'Delivery Driver',
 'Developer',
 'Digital Content Producer',
 'Digital Marketing Manager',
 'Digital Marketing Specialist',
 'Director',
 'Director of Business Development',
 'Director of Data Science',
 'Director of Engineering',
 'Director of Finance',
 'Director of HR',
 'Director of Human Capital',
 'Director of Human Resources',
 'Director of Marketing',
 'Director of Operations',
 'Director of Product Management',
 'Director of Sales',
 'Director of Sales and Marketing',
 'Event Coord

In [15]:
df.to_csv("fdataset.csv", index=False)

In [16]:
numeric_cols = ['Age', 'Years of Experience']
categorical_cols = ['Gender', 'Education Level', 'Job Title']

In [17]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

In [18]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler())
])

In [19]:
preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, categorical_cols),
    ('num', numerical_transformer, numeric_cols)
])

In [20]:
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('pca', PCA(n_components=58)),
    ('model', LinearRegression())
])

In [21]:
df['Salary'] = df['Salary'].fillna(df['Salary'].mean())

In [22]:
x = df.drop(columns=['Salary'])
y = df['Salary']

In [23]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [24]:
x_train.columns

Index(['Age', 'Gender', 'Education Level', 'Job Title', 'Years of Experience'], dtype='object')

In [25]:
pipeline.fit(x_train, y_train)

In [26]:
y_pred = pipeline.predict(x_test)

In [27]:
r2_score(y_test, y_pred)

0.8680280036187573

In [28]:
new_data = pd.DataFrame({
    'Age': [35],
    'Gender': ['Male'],
    'Education Level': ["Master's"],
    'Job Title': ['Data Scientist'],
    'Years of Experience': [7]
})

In [29]:
pipeline.predict(new_data)

array([137434.51421905])

In [30]:
import joblib

In [31]:
joblib.dump(pipeline, 'predict_salary.pkl')

['predict_salary.pkl']