In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load salaries.csv (path is relative to the notebook's working directory) into a DataFrame.
df = pd.read_csv('salaries.csv')

# **EDA**

In [None]:
"""Displaying shape of the dataset"""
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# First rows of the frame.
df.head()

In [None]:
# Last rows of the frame.
df.tail()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Distinct job titles present in the data.
df['job_title'].unique()

In [None]:
# Number of non-null job_title entries (this is a row count, not the number
# of distinct titles; .nunique() would give the latter).
df['job_title'].count()

In [None]:
# Distinct experience levels present in the data.
df['experience_level'].unique()

In [None]:
# Number of non-null experience_level entries (row count, not distinct count).
df['experience_level'].count()

In [None]:
# Distribution of the target variable. Labels make the figure readable on its
# own; plt.show() keeps the raw (counts, bins, patches) tuple out of the output.
plt.hist(df['salary_in_usd'],bins=30)
plt.xlabel('salary_in_usd')
plt.ylabel('Number of records')
plt.title('Distribution of salary_in_usd')
plt.show()

In [None]:
# Restrict the plot to the five most frequent job titles so the x-axis stays readable.
top_jobs = df['job_title'].value_counts().head(5).index
subset = df[df['job_title'].isin(top_jobs)]
plt.figure(figsize=(8,5))
plt.scatter(subset['job_title'],subset['salary_in_usd'])
plt.xlabel('job_title')
plt.ylabel('salary_in_usd')
plt.title('salary_in_usd for the 5 most common job titles')
# Job titles are long strings; tilt them so they do not overlap.
plt.xticks(rotation=45, ha='right')
# Render the figure without echoing the PathCollection repr.
plt.show()

In [None]:
# salary_in_usd vs. experience_level
sns.scatterplot(x = 'experience_level',y = 'salary_in_usd',data = df)

In [None]:
# salary_in_usd vs. remote_ratio
sns.scatterplot(x='remote_ratio',y='salary_in_usd',data = df)

In [None]:
# salary_in_usd vs. company_size
sns.scatterplot(x='company_size',y='salary_in_usd',data = df)

In [None]:
# Re-check column dtypes before looking at correlations between numeric columns.
df.info()

In [None]:
# Pairwise correlation of the numeric columns only, with the coefficient
# written inside each heatmap cell.
sns.heatmap(df.corr(numeric_only=True),annot = True)

In [None]:
# Distribution of work_year
plt.hist(df['work_year'],bins=5)

In [None]:
# Number of records per company_size category
sns.countplot(x=df['company_size'])

In [None]:
# Box plot comparing salary_in_usd across the remote_ratio values
sns.boxplot(x=df['remote_ratio'],y=df['salary_in_usd'])

In [None]:
# Strip plot comparing salary_in_usd across the company_size categories
sns.stripplot(x=df['company_size'],y=df['salary_in_usd'])

In [None]:
# NOTE: this cell is intentionally disabled. It label-encoded the categorical
# columns in place; the notebook one-hot encodes the features with
# pd.get_dummies further down instead.
# #Encoding categorical features
# from sklearn.preprocessing import LabelEncoder
# l = ['experience_level','employment_type','job_title','salary_currency','employee_residence','company_location','company_size']
# le = LabelEncoder()
# for i in l:
#   df[i] = le.fit_transform(df[i])

In [None]:
# Column dtypes right before the feature / target split.
df.info()

In [None]:
# Separate the features from the regression target.
# `salary` is dropped together with the target: presumably it is the same pay
# expressed in the local currency (NOTE(review): confirm against the CSV), in
# which case keeping it would leak the target into the features.
# errors='ignore' keeps the cell working if that column is not present.
X = df.drop(columns=['salary_in_usd', 'salary'], errors='ignore')
y = df['salary_in_usd']

In [None]:
# One-hot encode the categorical feature columns; X is rebound to the encoded frame.
X = pd.get_dummies(X)

In [None]:
# NOTE(review): this shows the shape of the original frame, which the encoding
# above does not touch; X.shape is probably what was meant here -- confirm.
df.shape

In [None]:
# Display the encoded feature matrix.
X

In [None]:
# Train / test split: 20% of the rows are held out for testing, and the fixed
# seed makes the partition identical on every run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Standardise the features. The scaler is fitted on the training split only,
# so no statistics of the test split influence training.
from sklearn.preprocessing import StandardScaler
se = StandardScaler()
X_train = se.fit_transform(X_train)
X_test_scaled = se.transform(X_test)
# X_train now holds scaled values, so any model fitted on it must be scored on
# test data in the same feature space. Rebind X_test to the scaled array so the
# two splits stay consistent; X_test_scaled remains available under its
# original name.
X_test = X_test_scaled

In [None]:
from sklearn.metrics import r2_score,mean_absolute_error,root_mean_squared_error
def evaluate(model,X_train,y_train,X_test,y_test): # model -> LinearRegression, KNN, DecisionTree, RandomForest
  """Print train and test regression metrics for an already fitted model.

  Predicts on both splits and prints R2, MAE and RMSE, each for the training
  split first and the testing split second.

  Parameters:
    model: fitted estimator exposing ``predict``.
    X_train, y_train: features and targets of the training split.
    X_test, y_test: features and targets of the testing split. The features
      must be preprocessed exactly like the data the model was fitted on.

  Returns:
    None. The metrics are only printed.
  """
  train_pred = model.predict(X_train)
  test_pred = model.predict(X_test)

  print(f"Accuracy for your {model} is: ")

  # (label, metric function) pairs, reported in this order.
  metrics = (
    ('R2 score', r2_score),
    ('MAE', mean_absolute_error),
    ('RMSE', root_mean_squared_error),
  )
  for label, metric in metrics:
    print(f'Training {label} for {model} is {metric(y_train,train_pred)}')
    print(f'Testing {label} for {model} is {metric(y_test,test_pred)}')

In [None]:
# Baseline model: ordinary least-squares linear regression.
# Ridge is imported here as well; it is used by the grid search further down.
from sklearn.linear_model import LinearRegression,Ridge
LinearRegression_model = LinearRegression()

In [None]:
# Fit on the training split (X_train holds the standardised features at this point).
LinearRegression_model.fit(X_train,y_train)

In [None]:
# The model was fitted on standardised features, so it has to be scored on the
# standardised test split (X_test_scaled), not on the raw X_test.
evaluate(LinearRegression_model,X_train,y_train,X_test_scaled,y_test)

In [None]:
y_pred = LinearRegression_model.predict(X_test)

In [None]:
y_pred_train = LinearRegression_model.predict(X_train)

In [None]:
from sklearn.metrics import r2_score
r2_score(y_test,y_pred) #Testing data accuracy (-)
r2_score(y_train,y_pred_train) #Training data accuracy(+)

In [None]:
# GridSearchCV is reused by every hyper-parameter search below.
from sklearn.model_selection import GridSearchCV

In [None]:
# Ridge regression with default settings; alpha is tuned by the search below.
ridge = Ridge()

In [None]:
# Candidate regularisation strengths.
ridge_param = {
    "alpha": [0.1,1,10,50,100]
}

In [None]:
# Exhaustive search over the alpha grid with 5-fold cross-validation.
ridge_search = GridSearchCV(ridge,ridge_param,cv=5)

In [None]:
# Run the search on the training split only.
ridge_search.fit(X_train,y_train)

In [None]:
# Estimator with the best cross-validated alpha.
# NOTE(review): best_ridge is not passed to evaluate() in the visible cells.
best_ridge = ridge_search.best_estimator_

In [None]:
# K-nearest-neighbours regressor with default settings.
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor()

In [None]:
# Disabled earlier attempt: it bound the parameter grid to the name `knn`,
# overwriting the estimator.
# knn ={
#     'n_neighbors': [1,2,3,4,5,6,7,8,9,10]
# }


In [None]:
# Candidate neighbourhood sizes.
knn_param = {
    'n_neighbors': [1,2,3,4,5,6,7,8,9,10]
}
knn = KNeighborsRegressor() # Re-initialize knn as a KNeighborsRegressor
# From here on `knn` names the GridSearchCV wrapper (5-fold CV), not the bare
# regressor; the line above makes this cell safe to re-run.
knn = GridSearchCV(knn,knn_param,cv=5)

In [None]:
# Run the search on the training split only.
knn.fit(X_train,y_train)

In [None]:
# Regressor with the best cross-validated n_neighbors.
bes_knn = knn.best_estimator_

In [None]:
# KNN is distance based and was fitted on standardised features, so it has to
# be scored on the standardised test split (X_test_scaled), not on the raw X_test.
evaluate(bes_knn,X_train,y_train,X_test_scaled,y_test)

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Fixed seed so the fitted tree and its feature importances are the same on
# every run; it matches the seed used for the train/test split.
dt = DecisionTreeRegressor(random_state=42)

In [None]:
tre_search = {
    'max_depth': [3,5,10],
    "min_samples_split": [2,5,10]
}


In [None]:
tre_search = GridSearchCV(dt,tre_search,cv=5)

In [None]:
# Run the decision-tree search on the training split only.
tre_search.fit(X_train,y_train)

In [None]:
# Tree with the best cross-validated max_depth / min_samples_split combination.
best_tre = tre_search.best_estimator_

In [None]:
# Display the selected estimator and its hyper-parameters.
best_tre

In [None]:
# Rank the encoded features by the importance the tuned tree assigns to them.
# Column names come from X (the one-hot encoded frame): X_train became a plain
# array after scaling and no longer carries them.
pd.DataFrame({
    'feature': X.columns,
    'importance': best_tre.feature_importances_
}).sort_values('importance',ascending=False)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Fixed seed so the bootstrap samples and feature sub-sampling, and therefore
# the search results, are the same on every run.
# The estimator keeps the name `le` because the grid-search cell refers to it
# by that name.
le = RandomForestRegressor(random_state=42)

In [None]:
# Hyper-parameter grid for the random forest: 3 x 3 x 2 = 18 combinations.
RF_param = {
    'n_estimators': [100,200,300],
    "max_depth" : [3,5,10],
    "max_features" : ["log2","sqrt"],

}

In [None]:
# Exhaustive search with 5-fold cross-validation; `le` is the
# RandomForestRegressor created in the previous cell.
RF_search = GridSearchCV(le,RF_param,cv=5)


In [None]:
# Run the search on the training split only (18 combinations x 5 folds).
RF_search.fit(X_train,y_train)

In [None]:
# Forest with the best cross-validated parameter combination.
best_RF = RF_search.best_estimator_