In [1]:
# import important libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# load the survey export; the path is relative to the notebook's working directory
csv_path = 'survey_results_public.csv'
df = pd.read_csv(csv_path)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'survey_results_public.csv'

In [None]:
# (rows, columns) of the frame as loaded from the CSV
df.shape

In [None]:
# keep only the columns used for salary prediction and give them shorter names;
# selecting and renaming in one chain builds a fresh frame instead of renaming
# a column selection of `df` in place
df2 = (
    df[['Employment', 'Country', 'EdLevel', 'YearsCodePro', 'ConvertedCompYearly']]
    .rename(columns={
        'ConvertedCompYearly': 'Salary',
        'YearsCodePro': 'YearOfExperience',
        'EdLevel': 'HighestEducation',
    })
)
df2.head()

# Data Cleaning

In [None]:
# keep only the rows that report a salary
has_salary = df2['Salary'].notna()
df2 = df2[has_salary]
df2.head()

In [None]:
# number of missing values in each column
df2.isna().sum()

In [None]:
# drop every row that still has a missing value; rebinding the result avoids
# an in-place drop on a frame that is itself a filtered selection
df2 = df2.dropna()

In [None]:
# confirm that no column has missing values left
df2.isna().sum()

In [None]:
# restrict the data to respondents whose employment is exactly 'Employed full-time'
is_full_time = df2['Employment'].eq('Employed full-time')
df2 = df2[is_full_time]
df2.head()

In [None]:
# (rows, columns) left after the null and full-time filters
df2.shape

In [None]:
# number of rows per country; the result is indexed by country name
country = df2['Country'].value_counts()
country

In [None]:
# countries with fewer than 400 rows, kept in their own variable
lessThen400 = country.loc[country.lt(400)]
lessThen400

In [None]:
# relabel every country with fewer than 400 rows as 'Others'.
# The country names are the *index* of the `lessThen400` counts, so membership
# is tested against that index explicitly, and the column is replaced through
# `assign` so no value is written into a filtered selection.
rare_countries = lessThen400.index
df2 = df2.assign(
    Country=df2['Country'].mask(df2['Country'].isin(rare_countries), 'Others')
)

In [None]:
# rows per country after the relabelling
df2['Country'].value_counts()

In [None]:
# drop the rows whose country was relabelled as 'Others'
keep_rows = df2['Country'].ne('Others')
df2 = df2[keep_rows]
df2

In [None]:
# rows per country after the 'Others' rows have been removed
df2['Country'].value_counts()

**Some country names are very long, so we first replace them with shorter names**

In [None]:
# shorten the very long country names.
# The replacement is assigned back to the frame: calling
# `df2.Country.replace(..., inplace=True)` works on an intermediate Series and
# is not guaranteed to change `df2` itself.
short_country_names = {
    'United Kingdom of Great Britain and Northern Ireland': 'Uk',
    'United States of America': 'America',
    'Russian Federation': 'Russia',
}
df2 = df2.assign(Country=df2['Country'].replace(short_country_names))

In [None]:
# distinct country labels that remain
df2['Country'].unique()

In [None]:
# rows per education answer before the answers are grouped
df2['HighestEducation'].value_counts()

In [None]:
# create a function to replace highest education column data
def change_education(x):
    """Collapse a raw education answer into a coarse label.

    The Bachelor's and Master's answers get short names, the other listed
    answers are grouped as "Non It Degree", and any value that is not listed
    is returned unchanged.
    """
    non_it_answers = (
        'Primary/elementary school',
        'Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)',
        'Some college/university study without earning a degree',
        'Other doctoral degree (Ph.D., Ed.D., etc.)',
        'Associate degree (A.A., A.S., etc.)',
        'Professional degree (JD, MD, etc.)',
        'Something else',
    )
    if x == 'Bachelor’s degree (B.A., B.S., B.Eng., etc.)':
        return "Bachelor's Degree"
    if x == 'Master’s degree (M.A., M.S., M.Eng., MBA, etc.)':
        return "Master’s Degree"
    if x in non_it_answers:
        return "Non It Degree"
    return x

In [None]:
# group the education answers with `change_education`; the column is replaced
# through `assign` so no value is written into a filtered selection
df2 = df2.assign(HighestEducation=df2['HighestEducation'].map(change_education))

In [None]:
# rows per education label after grouping
df2['HighestEducation'].value_counts()

In [None]:
# distinct YearOfExperience values before they are converted to numbers
df2['YearOfExperience'].unique()

**Some values are text rather than numbers (for example 'Less than 1 year'), so we replace them with numeric values**

In [None]:
# create a function to replace value of year of experience column
def change_year(x):
    """Convert one years-of-experience answer to a number.

    'Less than 1 year' becomes 0.5 and 'More than 50 years' becomes 50; any
    other value is passed through `float()`.
    """
    text_answers = (
        ('Less than 1 year', 0.5),
        ('More than 50 years', 50),
    )
    for answer, years in text_answers:
        if x == answer:
            return years
    return float(x)

In [None]:
# convert the experience answers to numbers with `change_year`; the column is
# replaced through `assign` so no value is written into a filtered selection
df2 = df2.assign(YearOfExperience=df2['YearOfExperience'].map(change_year))

In [None]:
# distinct YearOfExperience values after the conversion with change_year
df2['YearOfExperience'].unique()

# Find And Remove Outliers

In [None]:
# look for outliers with one horizontal box plot per numeric column.
# Each Series is passed as `x=` explicitly: a positional Series is read as `x`
# by older seaborn releases but as `data` by newer ones, which changes the plot.
fig, (ax_experience, ax_salary) = plt.subplots(1, 2, figsize=(18, 4))
sns.boxplot(x=df2['YearOfExperience'], ax=ax_experience)
ax_experience.set_title('Year Of Experience')
sns.boxplot(x=df2['Salary'], ax=ax_salary)
ax_salary.set_title('Salary')
plt.show()

In [None]:
# look for outliers per country: experience on the top axes, salary below.
# x and y are given by keyword because recent seaborn releases reject more
# than one positional argument.
fig, (ax_experience, ax_salary) = plt.subplots(2, 1, figsize=(18, 7))
sns.boxplot(data=df2, x='Country', y='YearOfExperience', ax=ax_experience)
sns.boxplot(data=df2, x='Country', y='Salary', ax=ax_salary)
fig.tight_layout()
plt.show()

**The outliers differ from country to country, so we clean each country separately**

**Find And Remove Outliers From Year Of Experience And Salary Column**

In [None]:
def select_country(frame, country):
    """Return an independent copy of the rows of `frame` whose Country equals `country`.

    A copy is returned because the per-country frames get columns assigned
    later on; assigning into a bare selection of `df2` is ambiguous in pandas.
    """
    return frame.loc[frame['Country'] == country].copy()


# one frame per country that is kept in the data
sweden = select_country(df2, 'Sweden')
spain = select_country(df2, 'Spain')
germany = select_country(df2, 'Germany')
turkey = select_country(df2, 'Turkey')
canada = select_country(df2, 'Canada')

france = select_country(df2, 'France')
switzerland = select_country(df2, 'Switzerland')
uk = select_country(df2, 'Uk')
russia = select_country(df2, 'Russia')

israel = select_country(df2, 'Israel')
america = select_country(df2, 'America')
brazil = select_country(df2, 'Brazil')

italy = select_country(df2, 'Italy')
netherlands = select_country(df2, 'Netherlands')
poland = select_country(df2, 'Poland')
india = select_country(df2, 'India')
australia = select_country(df2, 'Australia')
norway = select_country(df2, 'Norway')

In [None]:
def cap_outliers(frame, column, cap_lower=False, whisker=1.5):
    """Return a copy of `frame` with `column` capped at its Tukey fences.

    The fences are Q1 - whisker * IQR and Q3 + whisker * IQR, computed from
    `frame[column]` itself.

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows of a single country.
    column : str
        Name of the numeric column to cap.
    cap_lower : bool, default False
        When true, values below the lower fence are raised to it as well.
    whisker : float, default 1.5
        Multiple of the IQR that places the fences.

    Returns
    -------
    pandas.DataFrame
        New frame; `frame` itself is not modified.
    """
    q1 = frame[column].quantile(0.25)
    q3 = frame[column].quantile(0.75)
    iqr = q3 - q1
    # the upper fence starts from Q3; starting it from Q1 would cap ordinary
    # values that lie well inside the distribution
    upper_fence = q3 + whisker * iqr
    lower_fence = q1 - whisker * iqr if cap_lower else None
    capped = frame[column].clip(lower=lower_fence, upper=upper_fence)
    return frame.assign(**{column: capped})


def cap_country(frame, cap_low_salary=False):
    """Cap Salary (optionally on both sides), then YearOfExperience, for one country."""
    frame = cap_outliers(frame, 'Salary', cap_lower=cap_low_salary)
    return cap_outliers(frame, 'YearOfExperience')


# Upper outliers are capped for every country. Low salaries are additionally
# capped for Sweden, Germany, Switzerland and Norway only.
# NOTE: re-running this cell caps the already capped frames again; re-run the
# cell that splits `df2` by country first to start from the uncapped rows.
india = cap_country(india)
america = cap_country(america)
russia = cap_country(russia)
brazil = cap_country(brazil)
uk = cap_country(uk)
israel = cap_country(israel)
france = cap_country(france)
sweden = cap_country(sweden, cap_low_salary=True)
spain = cap_country(spain)
germany = cap_country(germany, cap_low_salary=True)
turkey = cap_country(turkey)
canada = cap_country(canada)
switzerland = cap_country(switzerland, cap_low_salary=True)
italy = cap_country(italy)
netherlands = cap_country(netherlands)
poland = cap_country(poland)
australia = cap_country(australia)
norway = cap_country(norway, cap_low_salary=True)


In [None]:
# stack the per-country frames into one frame and renumber the rows
country_frames = [
    india, russia, italy, america, uk, brazil,
    norway, netherlands, poland, australia, israel, sweden,
    switzerland, france, canada, germany, turkey, spain,
]
new_df = pd.concat(country_frames, ignore_index=True)

In [None]:
# first rows of the combined frame
new_df.head()

In [None]:
# (rows, columns) of the combined frame
new_df.shape

In [None]:
# box plot of experience per country after capping.
# x and y are given by keyword because recent seaborn releases reject more
# than one positional argument.
plt.figure(figsize = (18,10))
sns.boxplot(data=new_df, x='Country', y='YearOfExperience')
plt.title('Year Of Experience Based On Country', fontsize = 25)
plt.xlabel('Country', fontsize = 20)
plt.xticks(rotation = 90, fontsize = 16)
plt.ylabel('Year Of Experience', fontsize = 20)
plt.yticks(fontsize = 16)
plt.show()

In [None]:
# box plot of salary per country after capping.
# x and y are given by keyword because recent seaborn releases reject more
# than one positional argument.
plt.figure(figsize = (18,7))
sns.boxplot(data=new_df, x='Country', y='Salary')
plt.title('Salary Based On Country', fontsize = 25)
plt.xlabel('Country', fontsize = 20)
plt.xticks(rotation = 90, fontsize = 16)
plt.ylabel('Salary', fontsize = 20)
plt.yticks(fontsize = 16)
plt.show()

In [None]:
# Employment holds a single value after the full-time filter, so it is dropped
new_df = new_df.drop(columns='Employment')

In [None]:
# first rows after the Employment column has been dropped
new_df.head()

In [None]:
# work on a copy so the label encoding below leaves new_df untouched
dfx = new_df.copy()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# replace the country names with integer labels
le_country = LabelEncoder()
le_country.fit(dfx['Country'])
dfx['Country'] = le_country.transform(dfx['Country'])
dfx['Country'].unique()

In [None]:
# replace the education labels with integer labels
le_education = LabelEncoder()
le_education.fit(dfx['HighestEducation'])
dfx['HighestEducation'] = le_education.transform(dfx['HighestEducation'])
dfx['HighestEducation'].unique()

In [None]:
# first rows of the encoded frame
dfx.head()

# Model Building

In [None]:
# the target is Salary; the features are every other column
y = dfx['Salary']
X = dfx.drop(columns='Salary')

In [None]:
# import train test split for split data
from sklearn.model_selection import train_test_split

In [None]:
# hold out 20% of the rows for testing; the fixed seed keeps the split, and
# every metric computed from it, the same on each run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# import decision tree regressor for prediction
from sklearn.tree import DecisionTreeRegressor

**Use Grid Search CV For Better Result**

In [None]:
# import grid search cv fro get best model
from sklearn.model_selection import GridSearchCV

In [None]:
# cross-validated search over the tree depth, scored by negative mean squared
# error; the tree is seeded so that repeated runs fit the same trees
params = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30]}
dtr = DecisionTreeRegressor(random_state=42)
gsc = GridSearchCV(dtr, params, scoring='neg_mean_squared_error')
gsc.fit(X_train, y_train)

In [None]:
# best depth found and its cross-validated score
# (the score is a negative mean squared error, so values closer to 0 are better)
print(f'Best Parameters: {gsc.best_params_}')
print(f'Best Scores: {gsc.best_score_}')

In [None]:
# take the model selected by the grid search.
# GridSearchCV (with `refit` left at its default) has already refit the best
# configuration on the full training set, so it is used as is; fitting it
# again would only repeat that work.
dttr = gsc.best_estimator_
dttr

In [None]:
# predictions of the selected tree for the held-out test features
y_pred = dttr.predict(X_test)

In [None]:
# R^2 (coefficient of determination) of the predictions on the test set;
# r2_score is imported here because no earlier cell brings it into scope
from sklearn.metrics import r2_score

r2_score(y_test, y_pred)

In [None]:
# density of the test-set residuals (prediction minus actual salary)
residuals = y_pred - y_test
sns.displot(residuals, kind = 'kde')
plt.show()

In [None]:
# predicted salary (x) against actual salary (y) for the test set.
# x and y are given by keyword because recent seaborn releases reject more
# than one positional argument.
sns.scatterplot(x=y_pred, y=y_test)
plt.xlabel('Predicted salary')
plt.ylabel('Actual salary')
plt.show()