In [109]:
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from flask import Flask, request, jsonify
import joblib

In [110]:

# Seed NumPy's global RNG so the synthetic dataset is identical on every run.
np.random.seed(42)

# Category pools the synthetic records are sampled from.
us_states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
    'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia',
    'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa',
    'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
    'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri',
    'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey',
    'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio',
    'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
    'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
    'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
]
job_roles = [
    'Engineer', 'Manager', 'Technician', 'Clerk', 'Sales',
    'HR', 'Marketing', 'Consultant', 'Analyst', 'Executive',
]

# Number of synthetic rows to generate.
n_samples = 1000

# The draw order (region, then job role, then salary) is significant: all three
# draws consume the same seeded global RNG stream.
sampled_regions = np.random.choice(us_states, n_samples)
sampled_roles = np.random.choice(job_roles, n_samples)
sampled_salaries = np.random.uniform(30000, 120000, n_samples)

data = {
    'region': sampled_regions,
    'jobRole': sampled_roles,
    'salary': sampled_salaries,
}
dummy_data = pd.DataFrame(data)

# Render each salary as a currency string such as "$99,166.74".
dummy_data['salary'] = dummy_data['salary'].apply(lambda x: f"${x:,.2f}")

display(dummy_data)

Unnamed: 0,region,jobRole,salary
0,Rhode Island,Analyst,"$99,166.74"
1,New Hampshire,Engineer,"$40,179.88"
2,Iowa,Sales,"$101,595.93"
3,Texas,HR,"$112,681.00"
4,Delaware,Sales,"$84,054.86"
...,...,...,...
995,Montana,Manager,"$75,670.17"
996,North Dakota,Engineer,"$90,964.86"
997,Vermont,Executive,"$51,153.35"
998,Colorado,Consultant,"$104,761.97"


In [111]:
# Encode the categorical columns as integer codes. One encoder is kept per
# column so each mapping can be reused on its own later.
label_encoder_region = LabelEncoder()
label_encoder_jobRole = LabelEncoder()

dummy_data['region'] = label_encoder_region.fit_transform(dummy_data['region'])
dummy_data['jobRole'] = label_encoder_jobRole.fit_transform(dummy_data['jobRole'])

# Strip the '$' sign and thousands separators from 'salary', then convert to float.
# The pattern must be a raw string: in a regular string literal '\$' is an
# invalid escape sequence, which makes Python emit a warning when the cell is
# compiled. The regex itself is unchanged.
dummy_data['salary'] = dummy_data['salary'].replace(r'[\$,]', '', regex=True).astype(float)

display(dummy_data.head())

  dummy_data['salary'] = dummy_data['salary'].replace('[\$,]', '', regex=True).astype(float)


Unnamed: 0,region,jobRole,salary
0,38,0,99166.74
1,28,3,40179.88
2,14,8,101595.93
3,42,5,112681.0
4,7,8,84054.86


In [112]:

# Feature matrix: the two label-encoded categorical columns.
feature_columns = ['region', 'jobRole']
X = dummy_data[feature_columns]

# Regression target: the numeric salary column.
y = dummy_data['salary']


In [113]:
# Hold out 6% of the rows for testing; the fixed random_state keeps the
# partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.06,
    random_state=42,
)

# NOTE: a different technique for randomizing/splitting the data might give better results.
# The test_size value was chosen by trial and error.


In [114]:
# Fit an ordinary least-squares regressor on the training partition.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [115]:

# Persist the fitted model and both label encoders to disk so they can be
# reloaded later without retraining or refitting.
joblib.dump(value=model, filename='salary_model.pkl')
joblib.dump(value=label_encoder_region, filename='label_encoder_region.pkl')
joblib.dump(value=label_encoder_jobRole, filename='label_encoder_jobRole.pkl')



['label_encoder_jobRole.pkl']

In [116]:
# Predict salaries for the held-out test features.
y_pred = model.predict(X=X_test)



In [117]:
# Evaluate on the held-out set.
# R^2 comes from the estimator's own scorer; MSE is computed by hand from the
# per-row residuals.
r2 = model.score(X_test, y_test)

residuals = y_test - y_pred
mse = np.mean(residuals ** 2)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

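# The R^2 score is very close to zero, which means the model explains almost none of the variability in salary. This is expected here: the synthetic salaries were drawn independently of region and jobRole.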

Mean Squared Error: 584589730.3073157
R^2 Score: 0.0004957733239845385
