In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from wordcloud import WordCloud
import time

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file beneath the Kaggle input directory,
# then print them one per line in the order os.walk discovers them.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Getting Data and EDA

In [2]:
# Read the salary dataset from the Kaggle input directory and preview its first rows.
SALARIES_CSV = "../input/data-science-job-salaries/ds_salaries.csv"
data = pd.read_csv(SALARIES_CSV)
data.head()

First look:
- **Unnamed: 0**: a useless column; we can drop it.
- **work_year**: the year in which the salary was paid. Salaries tend to increase from one year to the next, so to avoid confusion I think we can treat these values as *ordinal* categorical values.
- **experience_level**: the experience level of the employee. It probably has a high correlation with the salary value.
- **job_title**: I am not sure whether the title makes any difference when the other features are the same. We will check it.
- **salary**: target value
- **salary_currency**: the currency has a direct, fixed effect on the raw salary values because of exchange rates. Since we also have the salary_in_usd column, fetching up-to-date rates and reconverting the salaries would be a good way to compare current values in a single currency.
- **salary_in_usd**: represents salary values in USD
- **employee_residence**: where the employee lives.
- **remote_ratio**: whether employees work from home. These values may be considered nominal.
- **company_location**: where the company is located.
- **company_size**: these may be considered as ordinal categorical values.

In [3]:
# data.info() lists each column's dtype and non-null count.
# Author's observation from its output: there are no null values among the 607 samples.
data.info()

In [4]:
# Summary statistics of the numeric columns, transposed so each column becomes a row.
data.describe().transpose()

In [5]:
# Drop the leftover CSV index column.
# Reassigning instead of using inplace=True, together with errors="ignore",
# keeps this cell idempotent: re-running it after the column is already gone
# no longer raises KeyError.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")
data.columns

In [6]:
# Preview the first rows of the frame at this point in the notebook.
data.head()

# Data Visualization

In [7]:
# Word cloud of the terms that occur most often across all job titles.
# Every title is concatenated into one space-separated string for the generator.
text = " ".join(data.job_title)

cloud_options = dict(
    width=3600,
    height=600,
    collocations=False,
    background_color='white',
)
word_cloud = WordCloud(**cloud_options).generate(text)

# Render the generated cloud as an image with the axes hidden.
plt.figure(figsize=(21, 6))
plt.imshow(word_cloud, interpolation='bilinear', alpha=0.9)
plt.axis("off")
plt.show()

In [8]:
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec


def format_axes(fig):
    """Show the bottom and left tick labels on every axes contained in ``fig``."""
    for ax in fig.axes:
        ax.tick_params(labelbottom=True, labelleft=True)


def top_counts_with_others(column, n=10, other_label="Others"):
    """Count the values of ``column`` and pool everything past the top ``n``.

    Parameters
    ----------
    column : pandas.Series
        Categorical values to count.
    n : int, default 10
        Number of most frequent categories to keep as separate entries.
    other_label : str, default "Others"
        Index label given to the pooled remainder.

    Returns
    -------
    pandas.Series
        Counts of the top ``n`` categories plus one ``other_label`` entry holding
        the summed count of the rest, sorted in descending order.
    """
    counts = column.value_counts()
    pooled = pd.Series([counts.iloc[n:].sum()], index=[other_label])
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    return pd.concat([counts.iloc[:n], pooled]).sort_values(ascending=False)


fig = plt.figure(constrained_layout=True, figsize=(22, 15))

# 4x3 grid: small panels around the edge, job titles in the large 2x2 centre-left area.
gs = GridSpec(4, 3, figure=fig)
ax1 = fig.add_subplot(gs[0, 0])
ax2 = fig.add_subplot(gs[0, 1])
ax3 = fig.add_subplot(gs[0, 2])
ax4 = fig.add_subplot(gs[1, 2])
ax5 = fig.add_subplot(gs[2, 2])
ax6 = fig.add_subplot(gs[1:3, 0:2])
ax7 = fig.add_subplot(gs[3, 0])
ax8 = fig.add_subplot(gs[3, 1])
ax9 = fig.add_subplot(gs[3, 2])

sns.countplot(ax=ax1, data=data, x="work_year")
sns.countplot(ax=ax2, data=data, x="experience_level")
sns.countplot(ax=ax3, data=data, x="employment_type")
sns.histplot(ax=ax4, data=data, x="salary_in_usd",
             bins=25, kde=True, color="green")

residence_counts = top_counts_with_others(data.employee_residence)
sns.barplot(ax=ax5, x=residence_counts.index, y=residence_counts.values,
            color=sns.color_palette()[1])
ax5.set(xlabel="Employee Residence", ylabel="count")

sns.countplot(ax=ax6, data=data, y="job_title")
sns.countplot(ax=ax7, data=data, x="company_size")

sns.countplot(ax=ax8, data=data, x="remote_ratio")
# Pin the tick positions before relabelling them.
# NOTE(review): the labels assume exactly three remote_ratio levels plotted in
# ascending order -- confirm against the data.
ax8.set_xticks(ax8.get_xticks())
ax8.set_xticklabels(["On Site", "Hybrid", "Remote"])

location_counts = top_counts_with_others(data.company_location)
sns.barplot(ax=ax9, x=location_counts.index, y=location_counts.values,
            color=sns.color_palette()[0])
ax9.set(xlabel="Company Location", ylabel="count")

fig.suptitle("Visual Representation of Features")
format_axes(fig)

plt.show()

# Detecting Outliers

In [9]:
# Outlier Detection

def find_outliers(data):
    global q_list
    q_list = []
    sorted_data = data.sort_values()
    
    for q, p in {"Q1": 25, "Q2": 50, "Q3": 75}.items():
        
        # Calculate Q1, Q2, Q3 and IQR.
        Q = np.percentile(sorted_data, p, interpolation = 'midpoint')
        q_list.append(Q)
        
        print("Checking...", q)
        time.sleep(2) 
        print("{}: {} percentile of the {} values is,".format(q,p,data.name), Q)
    
    global Q1, Q2, Q3
    
    Q1 = q_list[0]
    Q2 = q_list[1]
    Q3 = q_list[2]
    
    IQR = Q3 - Q1 
    print("Interquartile range is", IQR)
    
    # Find the lower and upper limits as Q1 – 1.5 IQR and Q3 + 1.5 IQR, respectively
    global low_lim, up_lim
    
    low_lim = Q1 - 1.5 * IQR
    up_lim = Q3 + 1.5 * IQR
    
    time.sleep(1)
    print(" ")
    print("Checking limits")
    time.sleep(2)
    print("low_limit is", low_lim)
    print("up_limit is", up_lim)
    
    
    time.sleep(1)
    # Find outliers in the dataset
    outliers =[]
    for x in sorted_data:
        if ((x> up_lim) or (x<low_lim)):
             outliers.append(x)
    print("\nOutliers are being added to list. Please wait!")
    time.sleep(3)
    print("\nOutliers in the dataset is", outliers)


# Run the outlier report on the USD salaries; the trailing semicolon keeps the
# cell from echoing a return value underneath the printed report.
find_outliers(data.salary_in_usd);

In [10]:
# Visualizing Outliers

def plot_outliers(data, lower=None, upper=None):
    """Draw a horizontal boxplot of ``data`` and shade the outlier regions in red.

    Parameters
    ----------
    data : pandas.Series
        Numeric values to plot.
    lower, upper : float, optional
        Outlier fences. When omitted, the module-level ``low_lim`` and
        ``up_lim`` globals are used, so they must already have been set.

    A red band is drawn only on a side where the data actually extends beyond
    the fence; it spans from the fence to the most extreme value on that side.
    """
    # Fall back to the globally published fences when none are passed in.
    lower = low_lim if lower is None else lower
    upper = up_lim if upper is None else upper

    fig, ax = plt.subplots(figsize=(22, 5))
    ax.ticklabel_format(style='plain', axis='both')

    sns.boxplot(ax=ax, x=data, palette="Paired")

    # Shade a side only if it has outliers; otherwise the band would cover a
    # stretch that lies inside the fences and mislabel it as an outlier area.
    smallest, largest = data.min(), data.max()
    if smallest < lower:
        ax.axvspan(smallest, lower, alpha=0.3, color='red')
    if largest > upper:
        ax.axvspan(upper, largest, alpha=0.3, color='red')


plot_outliers(data.salary_in_usd)

In [11]:
# Dropping outliers from data

# Keep the rows whose salary lies within the fences. The bounds are inclusive
# so this filter agrees with find_outliers, which only flags values strictly
# below low_lim or strictly above up_lim. .copy() makes clean_data a standalone
# frame, so later column assignments do not trigger SettingWithCopyWarning.
clean_data = data[data.salary_in_usd.between(low_lim, up_lim)].copy()
print("Minimum salary in USD: {}".format(clean_data.salary_in_usd.min()))
print("Maximum salary in USD: {}".format(clean_data.salary_in_usd.max()))

In [12]:
# Preview the first rows of the outlier-filtered frame.
clean_data.head()

## Data Preprocessing

In [13]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# One-hot encode the work_year column: learn its categories first, then
# transform the same column. The encoded matrix is kept in `asd`.
work_year_column = clean_data[["work_year"]]
ohe = OneHotEncoder()
ohe.fit(work_year_column)
asd = ohe.transform(work_year_column)

In [14]:
# Convert the encoder output to a dense array via .toarray() and print it.
print(asd.toarray())