In [1]:
# supervised machine learning regression scenario

# Predicting Salary of Software Developers

## Data Collection

In [2]:
# Bringing in some general use case libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Read the 2020 survey results CSV with pandas.
# NOTE: the former `comment='\t'` argument was removed. pandas stops parsing a
# line at the comment character, so a stray tab in an answer can silently blank
# the remaining fields of that row.
df = pd.read_csv(
    '../datasets/full/2020/survey_results_public.csv',
    na_values='?',            # treat a literal "?" as missing
    sep=',',
    skipinitialspace=True,    # ignore spaces that follow a delimiter
)

# Work on a copy so `df` remains available as the unmodified source frame.
data = df.copy()

In [7]:
# Confirm the read succeeded by eyeballing a random selection of rows.
# A fixed random_state makes the same rows appear on every
# "Restart Kernel -> Run All", so the displayed output is reproducible.
data.sample(n=20, random_state=42)

Unnamed: 0,Respondent,MainBranch,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,...,SurveyEase,SurveyLength,Trans,UndergradMajor,WebframeDesireNextYear,WebframeWorkedWith,WelcomeChange,WorkWeekHrs,YearsCode,YearsCodePro
34154,34279,I am a developer by profession,No,34.0,19.0,Yearly,31000.0,40070.0,United Kingdom,Pound sterling,...,Easy,Appropriate in length,No,"Computer science, computer engineering, or sof...",,Django;jQuery,Just as welcome now as I felt last year,30.0,5.0,3.0
50007,51037,I am a developer by profession,Yes,33.0,15.0,Monthly,550000.0,18108.0,Nigeria,Nigerian naira,...,Easy,Appropriate in length,No,"Computer science, computer engineering, or sof...",ASP.NET;ASP.NET Core;React.js;Vue.js,Angular;Angular.js;ASP.NET;ASP.NET Core;Flask;...,Just as welcome now as I felt last year,50.0,15.0,8.0
6288,6314,I am a developer by profession,Yes,24.0,17.0,Monthly,,,Philippines,Philippine peso,...,Neither easy nor difficult,Appropriate in length,No,"Information systems, information technology, o...",Angular;Angular.js;ASP.NET;ASP.NET Core;Django...,jQuery;Laravel;Ruby on Rails;Symfony;Vue.js,A lot more welcome now than last year,40.0,5.0,1.0
26275,26372,I am a student who is learning to code,Yes,,23.0,,,,Kenya,,...,Neither easy nor difficult,Appropriate in length,No,,Angular;Angular.js;Django;Flask;Laravel;React....,,,,2.0,
23779,23869,I am a developer by profession,Yes,33.0,14.0,Yearly,100000.0,100000.0,United States,United States dollar,...,Neither easy nor difficult,Appropriate in length,No,"Computer science, computer engineering, or sof...",React.js,React.js;Spring,Just as welcome now as I felt last year,40.0,12.0,8.0
881,883,I am a developer by profession,Yes,35.0,30.0,Yearly,82000.0,82000.0,United States,United States dollar,...,Easy,Too long,No,"Computer science, computer engineering, or sof...",Flask,Flask,Just as welcome now as I felt last year,40.0,5.0,1.0
10409,10442,I am a developer by profession,Yes,34.0,21.0,Monthly,15000.0,29580.0,Turkey,Turkish lira,...,Easy,Too short,No,"Computer science, computer engineering, or sof...",Django;jQuery,Django;jQuery,Just as welcome now as I felt last year,45.0,14.0,10.0
2514,2526,I am a developer by profession,Yes,22.0,17.0,Yearly,64000.0,64000.0,United States,United States dollar,...,Easy,Appropriate in length,No,"Computer science, computer engineering, or sof...",React.js;Vue.js,jQuery;React.js;Vue.js,Just as welcome now as I felt last year,40.0,5.0,2.0
4784,4800,I am a developer by profession,Yes,35.0,15.0,Monthly,4700.0,60972.0,Germany,European Euro,...,Neither easy nor difficult,Appropriate in length,No,Fine arts or performing arts (such as graphic ...,Angular;Django;Flask,Angular;Django;Flask,Just as welcome now as I felt last year,40.0,7.0,4.0
14724,14770,I am a developer by profession,Yes,50.0,13.0,Monthly,10000.0,27492.0,Brazil,Brazilian real,...,Easy,Too short,No,"Computer science, computer engineering, or sof...",ASP.NET Core,ASP.NET Core,A lot more welcome now than last year,10.0,36.0,35.0


### Problem Statement:
The data contains a `ConvertedComp` column, a continuous variable that gives each developer's total compensation.

The aim is to **predict the total compensation for a developer** given we have the feature variables for a potential candidate.

In [8]:
# Summarise the freshly loaded frame: column names, non-null counts and dtypes.
# The non-null counts show how much of each column is missing.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64461 entries, 0 to 64460
Data columns (total 61 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Respondent                    64461 non-null  int64  
 1   MainBranch                    64162 non-null  object 
 2   Hobbyist                      64416 non-null  object 
 3   Age                           45446 non-null  float64
 4   Age1stCode                    57900 non-null  object 
 5   CompFreq                      40069 non-null  object 
 6   CompTotal                     34826 non-null  float64
 7   ConvertedComp                 34756 non-null  float64
 8   Country                       64072 non-null  object 
 9   CurrencyDesc                  45472 non-null  object 
 10  CurrencySymbol                45468 non-null  object 
 11  DatabaseDesireNextYear        44067 non-null  object 
 12  DatabaseWorkedWith            49534 non-null  object 
 13  D

In [16]:
# The data set has 61 columns; only a subset is kept for this analysis.
# First batch of columns to discard (identifiers, demographics, tooling lists).
# NOTE(review): 61 - 20 = 41 columns should remain, yet the recorded info()
# output after this cell reports 40 and YearsCodePro is no longer listed.
# Presumably a cell that is no longer in the notebook dropped it - confirm,
# otherwise a fresh "Run All" will not reproduce the recorded outputs.
# This cell mutates `data` in place, so it can only be run once per load.
data.drop(
    columns=[
        'Respondent',
        'MainBranch',
        'Hobbyist',
        'CurrencySymbol',
        'DatabaseDesireNextYear',
        'DatabaseWorkedWith',
        'DevType',
        'Employment',
        'Ethnicity',
        'Gender',
        'JobFactors',
        'JobSat',
        'JobSeek',
        'LanguageDesireNextYear',
        'LanguageWorkedWith',
        'MiscTechDesireNextYear',
        'MiscTechWorkedWith',
        'NEWCollabToolsDesireNextYear',
        'NEWCollabToolsWorkedWith',
        'NEWDevOps',
    ],
    inplace=True,
)

In [17]:
# Re-check column names, non-null counts and dtypes after the first round of
# column removal.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64461 entries, 0 to 64460
Data columns (total 40 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Age                     45446 non-null  float64
 1   Age1stCode              57900 non-null  object 
 2   CompFreq                40069 non-null  object 
 3   CompTotal               34826 non-null  float64
 4   ConvertedComp           34756 non-null  float64
 5   Country                 64072 non-null  object 
 6   CurrencyDesc            45472 non-null  object 
 7   EdLevel                 57428 non-null  object 
 8   NEWDevOpsImpt           41728 non-null  object 
 9   NEWEdImpt               48461 non-null  object 
 10  NEWJobHunt              42282 non-null  object 
 11  NEWJobHuntResearch      41018 non-null  object 
 12  NEWLearn                56152 non-null  object 
 13  NEWOffTopic             50801 non-null  object 
 14  NEWOnboardGood          42619 non-null

In [18]:
# Second batch of columns to discard: the NEW* questions, site-usage (SO*)
# questions, survey feedback, and a few remaining demographic/tooling fields.
# This cell mutates `data` in place, so it can only be run once per load.
data.drop(
    columns=[
        'NEWDevOpsImpt',
        'NEWEdImpt',
        'NEWJobHunt',
        'NEWJobHuntResearch',
        'NEWLearn',
        'NEWOffTopic',
        'NEWOnboardGood',
        'NEWOtherComms',
        'NEWOvertime',
        'NEWPurchaseResearch',
        'NEWPurpleLink',
        'NEWSOSites',
        'NEWStuck',
        'OpSys',
        'Sexuality',
        'SOAccount',
        'SOComm',
        'SOPartFreq',
        'SOVisitFreq',
        'SurveyEase',
        'SurveyLength',
        'Trans',
        'UndergradMajor',
        'WebframeDesireNextYear',
        'WebframeWorkedWith',
        'WelcomeChange',
        'WorkWeekHrs',
    ],
    inplace=True,
)

In [19]:
# Re-check column names, non-null counts and dtypes after the second round of
# column removal.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64461 entries, 0 to 64460
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Age                     45446 non-null  float64
 1   Age1stCode              57900 non-null  object 
 2   CompFreq                40069 non-null  object 
 3   CompTotal               34826 non-null  float64
 4   ConvertedComp           34756 non-null  float64
 5   Country                 64072 non-null  object 
 6   CurrencyDesc            45472 non-null  object 
 7   EdLevel                 57428 non-null  object 
 8   OrgSize                 44331 non-null  object 
 9   PlatformDesireNextYear  50601 non-null  object 
 10  PlatformWorkedWith      53840 non-null  object 
 11  PurchaseWhat            39360 non-null  object 
 12  YearsCode               57680 non-null  object 
dtypes: float64(3), object(10)
memory usage: 6.4+ MB


In [20]:
# Third batch of columns to discard: organisation size, platform lists and
# purchasing influence.
# This cell mutates `data` in place, so it can only be run once per load.
data.drop(
    columns=[
        'OrgSize',
        'PlatformDesireNextYear',
        'PlatformWorkedWith',
        'PurchaseWhat',
    ],
    inplace=True,
)

In [None]:
# Re-check column names, non-null counts and dtypes after the third round of
# column removal.
data.info()
