# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Read File

In [2]:
df = pd.read_csv("Salary_Dataset_with_Extra_Features.csv")
#df2 = pd.read_csv("Software_Professional_Salaries.csv")

In [3]:
df.head()
#df2.head()

Unnamed: 0,Rating,Company Name,Job Title,Salary,Salaries Reported,Location,Employment Status,Job Roles
0,3.8,Sasken,Android Developer,400000,3,Bangalore,Full Time,Android
1,4.5,Advanced Millennium Technologies,Android Developer,400000,3,Bangalore,Full Time,Android
2,4.0,Unacademy,Android Developer,1000000,3,Bangalore,Full Time,Android
3,3.8,SnapBizz Cloudtech,Android Developer,300000,3,Bangalore,Full Time,Android
4,4.4,Appoids Tech Solutions,Android Developer,600000,3,Bangalore,Full Time,Android


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22770 entries, 0 to 22769
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Rating             22770 non-null  float64
 1   Company Name       22770 non-null  object 
 2   Job Title          22770 non-null  object 
 3   Salary             22770 non-null  int64  
 4   Salaries Reported  22770 non-null  int64  
 5   Location           22770 non-null  object 
 6   Employment Status  22770 non-null  object 
 7   Job Roles          22770 non-null  object 
dtypes: float64(1), int64(2), object(5)
memory usage: 1.4+ MB


In [5]:
df.describe()

Unnamed: 0,Rating,Salary,Salaries Reported
count,22770.0,22770.0,22770.0
mean,3.918213,695387.2,1.855775
std,0.519675,884399.0,6.823668
min,1.0,2112.0,1.0
25%,3.7,300000.0,1.0
50%,3.9,500000.0,1.0
75%,4.2,900000.0,1.0
max,5.0,90000000.0,361.0


In [6]:
df.dtypes
#All data types are valid

Rating               float64
Company Name          object
Job Title             object
Salary                 int64
Salaries Reported      int64
Location              object
Employment Status     object
Job Roles             object
dtype: object

In [7]:
df.duplicated()
# No Duplicated Data

0        False
1        False
2        False
3        False
4        False
         ...  
22765    False
22766    False
22767    False
22768    False
22769    False
Length: 22770, dtype: bool

In [8]:
df.isna().sum()
# No Missing Data

Rating               0
Company Name         0
Job Title            0
Salary               0
Salaries Reported    0
Location             0
Employment Status    0
Job Roles            0
dtype: int64

In [9]:
df["Job Title"].value_counts()

Software Development Engineer          2351
Android Developer                      2029
Software Development Engineer (SDE)    1614
Front End Developer                    1412
Test Engineer                          1314
                                       ... 
Java Andriod Developer                    1
Java Deceloper                            1
Java/J2EE Programmer                      1
Java SOA Developer                        1
Associate Web Developer                   1
Name: Job Title, Length: 1080, dtype: int64

In [10]:
df["Job Title"].nunique()

1080

In [11]:
df["Job Title"].str.contains("Developer").sum()

10767

In [12]:
df["Job Title"].str.contains("Engineer").sum()

10796

In [13]:
df["Job Title"].str.contains("Programmer").sum()

77

In [14]:
job_categories = ["Developer", "Engineer", "Programmer"]

In [15]:
conditions = [
    (df["Job Title"].str.contains("Developer")),
    (df["Job Title"].str.contains("Engineer")),
    df["Job Title"].str.contains("Programmer")
]

In [16]:
df["job_category"] = np.select(conditions, job_categories, default="Other")

In [17]:
df[["Job Title", "job_category"]].head()

Unnamed: 0,Job Title,job_category
0,Android Developer,Developer
1,Android Developer,Developer
2,Android Developer,Developer
3,Android Developer,Developer
4,Android Developer,Developer


In [18]:
df.drop("Job Title", axis=1, inplace=True)

In [19]:
df["Salaries Reported"].value_counts()

1      18206
2       2401
3        789
4        382
5        228
       ...  
39         1
162        1
361        1
135        1
100        1
Name: Salaries Reported, Length: 82, dtype: int64

In [20]:
df.drop("Salaries Reported", axis=1, inplace=True)

In [21]:
df["Employment Status"].value_counts()

Full Time     20083
Intern         2106
Contractor      548
Trainee          33
Name: Employment Status, dtype: int64

In [58]:
df["Job Roles"].value_counts()

SDE         7485
Android     2845
Frontend    2070
Java        1800
Testing     1719
IOS         1551
Backend     1085
Web          989
Python       935
Database     760
Mobile       220
Name: Job Roles, dtype: int64

In [23]:
# 75th percentile
seventy_fifth = df["Salary"].quantile(0.75)

# 25th percentile
twenty_fifth = df["Salary"].quantile(0.25)

# Interquartile range 
salaries_iqr = seventy_fifth - twenty_fifth
print(salaries_iqr)

600000.0


In [24]:
# Upper
upper = seventy_fifth + (salaries_iqr * 1.5)

lower = twenty_fifth - (salaries_iqr * 1.5)

print(upper)

1800000.0


In [25]:
df[(df["Salary"] > upper)]

Unnamed: 0,Rating,Company Name,Salary,Location,Employment Status,Job Roles,job_category
96,4.1,Dunzo,2800000,Bangalore,Full Time,Android,Developer
106,4.0,Walmart Global Tech,2300000,Bangalore,Full Time,Android,Developer
145,4.1,First Student,2200000,Bangalore,Intern,Android,Developer
161,4.4,Microsoft,2100000,Bangalore,Full Time,Android,Developer
162,3.7,Paytm Money,1900000,Bangalore,Full Time,Android,Developer
...,...,...,...,...,...,...,...
22241,3.9,Western Digital,2400000,Bangalore,Full Time,Web,Developer
22271,4.1,HealthGraph India,4400000,Bangalore,Full Time,Web,Developer
22274,2.9,Sud Express,2900000,Bangalore,Full Time,Web,Developer
22366,4.2,Greytip,2100000,Bangalore,Full Time,Web,Developer


In [26]:
df = df[(df["Salary"] > lower) & (df["Salary"] < upper)]

In [27]:
px.box(df,
       y='Salary',
       template='ggplot2',
       title='<b>Salaries')

In [28]:
fig=px.imshow(df.corr(),
              text_auto=True,
              height=600,
              width=600,
              template='ggplot2',
              aspect='auto',
              title='<b>Correlation of columns</b>')
fig.update_layout(title_x=0.5)
fig.show()

In [29]:
df.dtypes

Rating               float64
Company Name          object
Salary                 int64
Location              object
Employment Status     object
Job Roles             object
job_category          object
dtype: object

In [45]:
rl = df.groupby('Location', as_index=False)['Rating'].mean().sort_values(by='Rating', ascending=False)

fig = px.bar(rl,
             x = 'Location',
             y = 'Rating',
             color = 'Location',
             template = 'ggplot2',
             title = '<b> Rating of Location')
fig.update_layout(title_x=0.5)
fig.show()

In [46]:
re = df.groupby('Employment Status', as_index=False)['Rating'].mean().sort_values(by='Rating', ascending=False)

fig = px.bar(re,
             x = 'Employment Status',
             y = 'Rating',
             color = 'Employment Status',
             template = 'ggplot2',
             title = '<b> Rating of Employment Statuse')
fig.update_layout(title_x=0.5)
fig.show()

In [32]:
df["Company Name"].value_counts()

Tata Consultancy Services         268
Infosys                           168
Amazon                            148
Accenture                         146
Cognizant Technology Solutions    139
                                 ... 
Project42 Labs                      1
Transfin.                           1
ML Books International              1
Infallible HR                       1
Nextgen Innovation Labs             1
Name: Company Name, Length: 10953, dtype: int64

In [34]:
z = df.groupby('Company Name', as_index=False)['Salary'].mean().sort_values(by='Salary', ascending=False)

fig = px.bar(z.head(10),
           x = 'Company Name',
           y = 'Salary',
           color = 'Company Name',
           text = 'Salary',
           template = 'seaborn',
           title = '<b> Salary of Company Name')
fig.update_traces(textfont_size=8)
fig.show()

In [35]:
df.groupby('Employment Status')['Salary'].mean()

Employment Status
Contractor    518950.848369
Full Time     603959.169129
Intern        339560.730751
Trainee       324303.030303
Name: Salary, dtype: float64

In [57]:
se = df.groupby('Employment Status',as_index=False)['Salary'].mean().sort_values(by='Salary',ascending=False)
se['Salary'] = round(se['Salary'],0)

fig=px.pie(se,
           names = 'Employment Status',
           values = 'Salary',
           color = 'Employment Status',
           hole = 0.7,
           template = 'ggplot2',
           title = '<b>Salary of Employment Status')

fig.update_layout(title_x=0.5, legend=dict(orientation='h',
                                          yanchor='bottom',
                                          y=1.02,
                                          xanchor='right',
                                          x=1))

In [56]:
sj = df.groupby('job_category',as_index=False)['Salary'].mean().sort_values(by='Salary',ascending=False)
sj['Salary'] = round(sj['Salary'],0)

fig=px.pie(sj,
           names = 'job_category',
           values = 'Salary',
           color='job_category',
           hole = 0.7,
           labels = {'job_category':'Job Category  ','Salary':'Salary'},
           template = 'ggplot2',
           title = '<b>Salary of Job Category')

fig.update_layout(title_x=0.5, legend=dict(orientation='h',
                                          yanchor='bottom',
                                          y=1.02,
                                          xanchor='right',
                                          x=1))

In [60]:
sjr = df.groupby('Job Roles',as_index=False)['Salary'].mean().sort_values(by='Salary',ascending=False)
sjr['Salary'] = round(sjr['Salary'],0)

fig=px.pie(sjr,
           names = 'Job Roles',
           values = 'Salary',
           color = 'Job Roles',
           hole = 0.7,
           template = 'ggplot2',
           title = '<b>Salary of Job Category')

fig.update_layout(title_x=0.5, legend=dict(orientation='h',
                                          yanchor='bottom',
                                          y=1.02,
                                          xanchor='right',
                                          x=1))