# Prepare Data

Following the analysis in `Comp_EDA.ipynb`, this notebook prepares the survey data for building a machine learning model based on the types and the Country Group.

In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import math

In [2]:
# Matplotlib Config
# Global plot styling for the notebook. The calls run in this order, so a later
# call can override settings made by an earlier one.
sns.set()  # seaborn's default theme
plt.rcParams["figure.figsize"] = (14,8)  # default figure size (width, height) in inches
plt.style.use('fivethirtyeight')  # matplotlib's built-in 'fivethirtyeight' style sheet

In [3]:
# Read the 2019 survey results CSV (relative path) into the raw frame.
df_result = pd.read_csv('../data/2019/survey_results_public.csv')
# Print (rows, columns), then show the first rows as the cell output.
print(df_result.shape)
df_result.head()

(88883, 85)


Unnamed: 0,Respondent,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,...,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
0,1,I am a student who is learning to code,Yes,Never,The quality of OSS and closed source software ...,"Not employed, and not looking for work",United Kingdom,No,Primary/elementary school,,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,14.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
1,2,I am a student who is learning to code,No,Less than once per year,The quality of OSS and closed source software ...,"Not employed, but looking for work",Bosnia and Herzegovina,"Yes, full-time","Secondary school (e.g. American high school, G...",,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,19.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
2,3,"I am not primarily a developer, but I write co...",Yes,Never,The quality of OSS and closed source software ...,Employed full-time,Thailand,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Web development or web design,...,Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,28.0,Man,No,Straight / Heterosexual,,Yes,Appropriate in length,Neither easy nor difficult
3,4,I am a developer by profession,No,Never,The quality of OSS and closed source software ...,Employed full-time,United States,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,22.0,Man,No,Straight / Heterosexual,White or of European descent,No,Appropriate in length,Easy
4,5,I am a developer by profession,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,Ukraine,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",...,Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,30.0,Man,No,Straight / Heterosexual,White or of European descent;Multiracial,No,Appropriate in length,Easy


In [5]:
# Load Country Group Dictionary
# Maps a country name (as used in the survey's Country column) to its group label.
# The encoding is pinned to UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open('../data/2019/country_group_dict.json', 'r', encoding='utf-8') as f:
    country_group_dict = json.load(f)

country_group_dict

{'United States': 'US',
 'Switzerland': 'Group A',
 'Israel': 'Group A',
 'Denmark': 'Group A',
 'Norway': 'Group A',
 'Australia': 'Group A',
 'Ireland': 'Group A',
 'Canada': 'Group A',
 'United Kingdom': 'Group A',
 'New Zealand': 'Group B',
 'Singapore': 'Group B',
 'Germany': 'Group B',
 'Netherlands': 'Group B',
 'Hong Kong (S.A.R.)': 'Group B',
 'Sweden': 'Group B',
 'Japan': 'Group B',
 'Finland': 'Group B',
 'United Arab Emirates': 'Group B',
 'Belgium': 'Group B',
 'Austria': 'Group B',
 'France': 'Group C',
 'South Korea': 'Group C',
 'South Africa': 'Group C',
 'Thailand': 'Group C',
 'Spain': 'Group C',
 'Estonia': 'Group C',
 'Latvia': 'Group C',
 'Uruguay': 'Group C',
 'Slovenia': 'Group C',
 'Italy': 'Group C',
 'Czech Republic': 'Group C',
 'Lithuania': 'Group C',
 'Poland': 'Group C',
 'Romania': 'Group C',
 'Slovakia': 'Group C',
 'Chile': 'Group C',
 'Taiwan': 'Group C',
 'Portugal': 'Group C',
 'Bulgaria': 'Group D',
 'China': 'Group D',
 'Hungary': 'Group D',
 'Cr

In [104]:
# Clean data as analysed in EDA
def clean_data(df):
    """Drop unusable rows, fill missing Age, and convert the Years columns to float.

    Steps:
      1. Drop rows where Country or ConvertedComp is missing.
      2. Drop rows where both YearsCode and YearsCodePro are missing.
      3. Fill a missing Age with the mean Age of the row's Country. The mean is
         computed over all rows of ``df`` (before any rows are dropped) and
         rounded to one decimal. Countries without any known Age keep NaN.
      4. Replace 'Less than 1 year' with 0.5 and 'More than 50 years' with 60
         in YearsCode and YearsCodePro, then cast both columns to float.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns Country, ConvertedComp, Age, YearsCode and
        YearsCodePro.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original index labels of the kept rows;
        ``df`` itself is not modified.
    """
    years_text_to_number = {'Less than 1 year': '0.5', 'More than 50 years': '60'}

    # Drop rows with missing values in Country or ConvertedComp
    df_new = df.dropna(subset=['Country', 'ConvertedComp'], how='any')
    # Drop rows with missing values in both YearsCode and YearsCodePro.
    # The explicit copy makes the column assignments below write to an
    # independent frame rather than to a possible view of the input.
    df_new = df_new.dropna(subset=['YearsCode', 'YearsCodePro'], how='all').copy()

    # Per-country mean Age (NaN ages are skipped by mean()), indexed by Country
    age_mean = df.groupby('Country')['Age'].mean().round(decimals=1)
    # Vectorised fill: look up each row's country mean and use it only where Age is NaN
    df_new['Age'] = df_new['Age'].fillna(df_new['Country'].map(age_mean))

    for col in ('YearsCodePro', 'YearsCode'):
        # Assign the result back: replace()/astype() return new objects
        df_new[col] = df_new[col].replace(years_text_to_number).astype(float)

    return df_new

In [105]:
# Clean the raw survey frame; the result is bound to a new name, df_cleaned.
df_cleaned = clean_data(df_result)


In [106]:
# Sanity check: number of rows in df_cleaned whose Age is still missing.
df_cleaned['Age'].isnull().sum()

0

In [None]:
def trim_data(df, max_comp, max_comp_us, min_comp, min_age, countries):
    """Return the rows of ``df`` inside the given compensation/age limits.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns Country, ConvertedComp, Age and YearsCodePro.
    max_comp : number
        Upper bound (inclusive) on ConvertedComp for rows whose Country is not
        "United States".
    max_comp_us : number
        Upper bound (inclusive) on ConvertedComp for "United States" rows.
    min_comp : number
        Lower bound (inclusive) on ConvertedComp for all rows.
    min_age : number
        Rows are kept only when Age is strictly greater than this value
        (rows with a missing Age are therefore dropped).
    countries : collection of str, or None
        When non-empty, only rows whose Country is in it are kept; an empty
        collection or None disables the country filter.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by ConvertedComp in descending order, with
        YearsCodePro converted to float ('Less than 1 year' -> 0.5,
        'More than 50 years' -> 60). ``df`` itself is not modified.
    """
    comp = df['ConvertedComp']
    is_us = df['Country'] == "United States"

    # Drop rows out of the given range (the US has its own upper bound)
    in_range = (
        (comp >= min_comp)
        & ((is_us & (comp <= max_comp_us)) | (~is_us & (comp <= max_comp)))
        & (df['Age'] > min_age)
    )
    df_trimmed = df.loc[in_range].sort_values(by='ConvertedComp', ascending=False).copy()

    # Convert YearsCodePro to float. An exact-value replace (rather than
    # .str.replace) also works when the column has already been converted to numbers.
    df_trimmed['YearsCodePro'] = (
        df_trimmed['YearsCodePro']
        .replace({'Less than 1 year': '0.5', 'More than 50 years': '60'})
        .astype(float)
    )

    # Filter out countries
    if countries is not None and len(countries) > 0:
        df_trimmed = df_trimmed.loc[df_trimmed['Country'].isin(countries)]

    return df_trimmed


# NOTE(review): `country_list_all` is not defined in any cell visible here — confirm it
# is created earlier in the notebook, otherwise this cell fails on a fresh kernel.
# NOTE(review): this trims the raw `df_result`, not `df_cleaned` — confirm that is intended.
# Positional arguments: max_comp, max_comp_us, min_comp, min_age, countries.
df_result_trimmed = trim_data(df_result, 250000, 500000, 1000, 10, country_list_all)
# Look up each row's country group; a Country missing from country_group_dict raises KeyError.
df_result_trimmed['CountryGroup'] = df_result_trimmed['Country'].apply(lambda x: country_group_dict[x])

In [None]:
# Select the subset of columns to carry forward. Each column is listed exactly once:
# selecting 'Country' twice would produce two identically named columns, and
# new_df['Country'] would then return a DataFrame instead of a Series.
new_df = df_result_trimmed[['Country', 'CountryGroup', 'ConvertedComp', 'Age', 'YearsCodePro', 'DevType', 'LanguageWorkedWith', 'DatabaseWorkedWith', 'PlatformWorkedWith', 'WebFrameWorkedWith', 'MiscTechWorkedWith']]
