# Employee Salary Prediction using Machine Learning
This project predicts employee salaries from experience, education, job role, location, and performance rating. It trains a Linear Regression model and a Random Forest regressor on a synthetically generated dataset and compares their R2 score and RMSE.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
import random

# Salary model used to synthesise the dataset: every employee starts from
# BASE_SALARY and receives the bonus listed for their education and role.
BASE_SALARY = 30000
EDUCATION_BONUS = {'Bachelor': 0, 'Master': 5000, 'PhD': 10000}
ROLE_BONUS = {'Developer': 0, 'Manager': 10000, 'Analyst': 5000}

roles = list(ROLE_BONUS)
education = list(EDUCATION_BONUS)
locations = ['New York', 'Delhi', 'London']

COLUMNS = ['Experience', 'Education', 'JobRole', 'Location', 'PerformanceRating', 'Salary']


def generate_employee_data(n_rows=500, seed=42):
    """Build a synthetic employee table with a known salary formula.

    Salary = BASE_SALARY + 2500 * experience + 2000 * rating
             + education bonus + role bonus + uniform integer noise in
             [-3000, 3000].
    Location is recorded for every row but does not enter the formula.

    Args:
        n_rows: Number of employee rows to generate.
        seed: Seed for a private random generator, so the same seed always
            yields the same table. Pass None for a different table on
            every call.

    Returns:
        A pandas DataFrame with the columns listed in COLUMNS.
    """
    # A dedicated generator keeps the data reproducible without touching
    # the global state of the `random` module.
    rng = random.Random(seed)

    rows = []
    for _ in range(n_rows):
        exp = rng.randint(1, 20)
        edu = rng.choice(education)
        role = rng.choice(roles)
        loc = rng.choice(locations)
        rating = rng.randint(1, 5)

        salary = (
            BASE_SALARY
            + exp * 2500
            + rating * 2000
            + EDUCATION_BONUS[edu]
            + ROLE_BONUS[role]
            + rng.randint(-3000, 3000)  # Noise
        )
        rows.append([exp, edu, role, loc, rating, salary])

    return pd.DataFrame(rows, columns=COLUMNS)


df = generate_employee_data()
df.to_csv('employee_data.csv', index=False)
df.head()

In [None]:
df = pd.read_csv('employee_data.csv')

# One-hot encode the categorical features.
# These columns are unordered categories; integer codes (such as the
# alphabetical ones LabelEncoder assigns) would be read by a linear model as
# numeric magnitudes and impose an arbitrary ranking on the categories.
# drop_first=True keeps one level per feature as the reference category so the
# indicator columns are not perfectly collinear with the model's intercept.
categorical_cols = ['Education', 'JobRole', 'Location']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True, dtype=int)

In [None]:
# Separate the target column from the predictors.
y = df['Salary']
X = df.drop(columns=['Salary'])

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit the linear baseline on the training split and predict the held-out rows.
lr = LinearRegression().fit(X_train, y_train)
lr_preds = lr.predict(X_test)

# Score the predictions: R2 and root-mean-squared error on the test split.
lr_r2 = r2_score(y_test, lr_preds)
lr_rmse = np.sqrt(mean_squared_error(y_test, lr_preds))

print("Linear Regression R2 Score:", lr_r2)
print("Linear Regression RMSE:", lr_rmse)

In [None]:
# Fit a 100-tree random forest (seeded for repeatable results) on the training
# split and predict the held-out rows.
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
rf_preds = rf.predict(X_test)

# Score the predictions with the same metrics used for the linear model.
rf_r2 = r2_score(y_test, rf_preds)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_preds))

print("Random Forest R2 Score:", rf_r2)
print("Random Forest RMSE:", rf_rmse)

In [None]:
# Pair each input column with the importance the fitted forest assigned to it.
# The forest was trained on a split of X, so both sequences share X's column order.
features = X.columns
importances = rf.feature_importances_

# Draw one horizontal bar per feature on an explicitly created axes.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=importances, y=features, ax=ax)
ax.set_title("Feature Importance")
ax.set_xlabel("Importance")
ax.set_ylabel("Features")
plt.show()