## Imports

In [125]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

## Gathering data

In [95]:
df = pd.read_csv('./developer_survey_2020/survey_results_public.csv')

## Assessing data

In [96]:
schema = pd.read_csv('./developer_survey_2020/survey_results_schema.csv')
schema

Unnamed: 0,Column,QuestionText
0,Respondent,Randomized respondent ID number (not in order ...
1,MainBranch,Which of the following options best describes ...
2,Hobbyist,Do you code as a hobby?
3,Age,What is your age (in years)? If you prefer not...
4,Age1stCode,At what age did you write your first line of c...
...,...,...
56,WebframeWorkedWith,Which web frameworks have you done extensive d...
57,WelcomeChange,"Compared to last year, how welcome do you feel..."
58,WorkWeekHrs,"On average, how many hours per week do you wor..."
59,YearsCode,"Including any education, how many years have y..."


In [97]:
df.columns

Index(['Respondent', 'MainBranch', 'Hobbyist', 'Age', 'Age1stCode', 'CompFreq',
       'CompTotal', 'ConvertedComp', 'Country', 'CurrencyDesc',
       'CurrencySymbol', 'DatabaseDesireNextYear', 'DatabaseWorkedWith',
       'DevType', 'EdLevel', 'Employment', 'Ethnicity', 'Gender', 'JobFactors',
       'JobSat', 'JobSeek', 'LanguageDesireNextYear', 'LanguageWorkedWith',
       'MiscTechDesireNextYear', 'MiscTechWorkedWith',
       'NEWCollabToolsDesireNextYear', 'NEWCollabToolsWorkedWith', 'NEWDevOps',
       'NEWDevOpsImpt', 'NEWEdImpt', 'NEWJobHunt', 'NEWJobHuntResearch',
       'NEWLearn', 'NEWOffTopic', 'NEWOnboardGood', 'NEWOtherComms',
       'NEWOvertime', 'NEWPurchaseResearch', 'NEWPurpleLink', 'NEWSOSites',
       'NEWStuck', 'OpSys', 'OrgSize', 'PlatformDesireNextYear',
       'PlatformWorkedWith', 'PurchaseWhat', 'Sexuality', 'SOAccount',
       'SOComm', 'SOPartFreq', 'SOVisitFreq', 'SurveyEase', 'SurveyLength',
       'Trans', 'UndergradMajor', 'WebframeDesireNextYear',
  

## 1. Question: What are the differences between people who come from a STEM/ComSci background and those who did not?

In [98]:
# drop respondents who did not give the major of their eduction
df_1 = df.dropna(subset=['UndergradMajor'])
df_1.head()

Unnamed: 0,Respondent,MainBranch,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,...,SurveyEase,SurveyLength,Trans,UndergradMajor,WebframeDesireNextYear,WebframeWorkedWith,WelcomeChange,WorkWeekHrs,YearsCode,YearsCodePro
0,1,I am a developer by profession,Yes,,13,Monthly,,,Germany,European Euro,...,Neither easy nor difficult,Appropriate in length,No,"Computer science, computer engineering, or sof...",ASP.NET Core,ASP.NET;ASP.NET Core,Just as welcome now as I felt last year,50.0,36,27
1,2,I am a developer by profession,No,,19,,,,United Kingdom,Pound sterling,...,,,,"Computer science, computer engineering, or sof...",,,Somewhat more welcome now than last year,,7,4
3,4,I am a developer by profession,Yes,25.0,18,,,,Albania,Albanian lek,...,,,No,"Computer science, computer engineering, or sof...",,,Somewhat less welcome now than last year,40.0,7,4
4,5,"I used to be a developer by profession, but no...",Yes,31.0,16,,,,United States,,...,Easy,Too short,No,"Computer science, computer engineering, or sof...",Django;Ruby on Rails,Ruby on Rails,Just as welcome now as I felt last year,,15,8
6,7,I am a developer by profession,Yes,,18,Monthly,,,India,United States dollar,...,,,,"Computer science, computer engineering, or sof...",,,A lot more welcome now than last year,,6,4


In [99]:
df_1['UndergradMajor'].unique().tolist()

['Computer science, computer engineering, or software engineering',
 'Mathematics or statistics',
 'Another engineering discipline (such as civil, electrical, mechanical, etc.)',
 'A humanities discipline (such as literature, history, philosophy, etc.)',
 'A health science (such as nursing, pharmacy, radiology, etc.)',
 'Information systems, information technology, or system administration',
 'Web development or web design',
 'A natural science (such as biology, chemistry, physics, etc.)',
 'Fine arts or performing arts (such as graphic design, music, studio art, etc.)',
 'I never declared a major',
 'A social science (such as anthropology, psychology, political science, etc.)',
 'A business discipline (such as accounting, finance, marketing, etc.)']

In [100]:
df_1.shape

(50995, 61)

In [101]:
# create conditional var whether someone has majored in STEM or not 
df_1.loc[:,'HasBackground'] = df.UndergradMajor.isin(['Computer science, computer engineering, or software engineering',
 'Mathematics or statistics',
 'Another engineering discipline (such as civil, electrical, mechanical, etc.)','Information systems, information technology, or system administration',
 'Web development or web design',
 'A natural science (such as biology, chemistry, physics, etc.)']).astype(int)
df_1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


Unnamed: 0,Respondent,MainBranch,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,...,SurveyLength,Trans,UndergradMajor,WebframeDesireNextYear,WebframeWorkedWith,WelcomeChange,WorkWeekHrs,YearsCode,YearsCodePro,HasBackground
0,1,I am a developer by profession,Yes,,13,Monthly,,,Germany,European Euro,...,Appropriate in length,No,"Computer science, computer engineering, or sof...",ASP.NET Core,ASP.NET;ASP.NET Core,Just as welcome now as I felt last year,50.0,36,27,1
1,2,I am a developer by profession,No,,19,,,,United Kingdom,Pound sterling,...,,,"Computer science, computer engineering, or sof...",,,Somewhat more welcome now than last year,,7,4,1
3,4,I am a developer by profession,Yes,25.0,18,,,,Albania,Albanian lek,...,,No,"Computer science, computer engineering, or sof...",,,Somewhat less welcome now than last year,40.0,7,4,1
4,5,"I used to be a developer by profession, but no...",Yes,31.0,16,,,,United States,,...,Too short,No,"Computer science, computer engineering, or sof...",Django;Ruby on Rails,Ruby on Rails,Just as welcome now as I felt last year,,15,8,1
6,7,I am a developer by profession,Yes,,18,Monthly,,,India,United States dollar,...,,,"Computer science, computer engineering, or sof...",,,A lot more welcome now than last year,,6,4,1


In [102]:
# select fewer variables of interest
df_2 = df_1[['Respondent', 'MainBranch', 'Hobbyist', 'Age1stCode', 'EdLevel', 'Employment', 'JobSat', 'JobSeek', 'YearsCode', 'YearsCodePro', 'HasBackground']]
# make some of the conditional vars numerical, as I think it makes more sense that way
df_2.loc[df_2['YearsCode'] == 'Less than 1 year', 'YearsCode'] = 0
df_2.loc[df_2['YearsCode'] == 'More than 50 years', 'YearsCode'] = 51
df_2.loc[df_2['YearsCodePro'] == 'Less than 1 year', 'YearsCodePro'] = 0
df_2.loc[df_2['YearsCodePro'] == 'More than 50 years', 'YearsCodePro'] = 51
df_2.loc[df_2['Age1stCode'] == 'Younger than 5 years', 'Age1stCode'] = 4
df_2.loc[df_2['Age1stCode'] == 'Older than 85', 'Age1stCode'] = 86
df_2['YearsCode'] = df_2['YearsCode'].astype(float)
df_2['YearsCodePro'] = df_2['YearsCodePro'].astype(float)
df_2['Age1stCode'] = df_2['Age1stCode'].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2['YearsCode'] = df_2['YearsCode'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2['YearsCodePro'] = df_2['YearsCodePro'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try usin

In [103]:
# Fill numeric columns with the mean
num_vars = df_2.select_dtypes(include=['float', 'int']).columns
for col in num_vars:
    df_2[col].fillna((df_2[col].mean()), inplace=True)
    
# Dummy the categorical variables
cat_vars = df_2.select_dtypes(include=['object']).copy().columns
for var in cat_vars:
    # for each cat add dummy var, drop original column
    df_2 = pd.concat([df_2.drop(var, axis=1), pd.get_dummies(df_2[var], prefix=var, prefix_sep='_')], axis=1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [104]:
df_2_STEM = df_2[df_2['HasBackground'] == 1]
df_2_STEM.shape

(46250, 33)

In [105]:
df_2_NOTSTEM = df_2[df_2['HasBackground'] == 0]
df_2_NOTSTEM.head()

Unnamed: 0,Respondent,Age1stCode,YearsCode,YearsCodePro,HasBackground,MainBranch_I am a developer by profession,MainBranch_I am a student who is learning to code,"MainBranch_I am not primarily a developer, but I write code sometimes as part of my work",MainBranch_I code primarily as a hobby,"MainBranch_I used to be a developer by profession, but no longer am",...,Employment_Retired,Employment_Student,JobSat_Neither satisfied nor dissatisfied,JobSat_Slightly dissatisfied,JobSat_Slightly satisfied,JobSat_Very dissatisfied,JobSat_Very satisfied,JobSeek_I am actively looking for a job,JobSeek_I am not interested in new job opportunities,"JobSeek_I’m not actively looking, but I am open to new opportunities"
26,27,12.0,20.0,20.0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
27,28,22.0,2.0,8.525492,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,1
47,48,30.0,8.0,8.0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
49,50,46.0,10.0,9.0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
50,51,10.0,23.0,7.0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1


In [106]:
# find columns with biggest difference in mean between STEM and not-STEM
df_res_1 = pd.DataFrame([df_2_STEM.mean(), df_2_NOTSTEM.mean()]).transpose().rename(columns={0: 'STEMbackground', 1: 'nonSTEMbackground'})
df_res_1['difference'] = abs(df_res_1['STEMbackground'] - df_res_1['nonSTEMbackground'])
df_res_1 = df_res_1.sort_values(by='difference', ascending=False).drop(labels=['Respondent', 'HasBackground'])
df_res_1.head(20)

Unnamed: 0,STEMbackground,nonSTEMbackground,difference
Age1stCode,15.302807,17.896123,2.593316
YearsCode,13.410821,12.11837,1.292452
YearsCodePro,8.53845,8.399191,0.139259
MainBranch_I am a developer by profession,0.798551,0.668072,0.13048
"MainBranch_I am not primarily a developer, but I write code sometimes as part of my work",0.080411,0.171549,0.091138
EdLevel_Some college/university study without earning a degree,0.130768,0.203583,0.072815
"EdLevel_Bachelor’s degree (B.A., B.S., B.Eng., etc.)",0.510703,0.464489,0.046214
"EdLevel_Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",0.253946,0.213488,0.040458
"Employment_Independent contractor, freelancer, or self-employed",0.085362,0.125395,0.040033
Employment_Employed full-time,0.761557,0.722023,0.039534


## 2. Question: Which factors influence hourly wage? How to maximize it?

In [169]:
df_3 = df.dropna(subset=['ConvertedComp', 'WorkWeekHrs'], how='any')
df_3.shape

(33658, 61)

In [170]:
df_3.loc[:,'HourlyComp'] = df['ConvertedComp'].copy()/(50*df['WorkWeekHrs'].copy())
df_3.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


Unnamed: 0,Respondent,MainBranch,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,...,SurveyLength,Trans,UndergradMajor,WebframeDesireNextYear,WebframeWorkedWith,WelcomeChange,WorkWeekHrs,YearsCode,YearsCodePro,HourlyComp
7,8,I am a developer by profession,Yes,36.0,12,Yearly,116000.0,116000.0,United States,United States dollar,...,Appropriate in length,No,"Computer science, computer engineering, or sof...",Django;React.js;Vue.js,Flask,Just as welcome now as I felt last year,39.0,17,13,59.487179
9,10,I am a developer by profession,Yes,22.0,14,Yearly,25000.0,32315.0,United Kingdom,Pound sterling,...,Appropriate in length,No,Mathematics or statistics,Flask;jQuery,Flask;jQuery,Somewhat more welcome now than last year,36.0,8,4,17.952778
10,11,I am a developer by profession,Yes,23.0,13,Yearly,31000.0,40070.0,United Kingdom,Pound sterling,...,Appropriate in length,No,"Computer science, computer engineering, or sof...",Angular;Django;React.js,Angular;Angular.js;Django;React.js,Just as welcome now as I felt last year,40.0,10,2,20.035
11,12,I am a developer by profession,No,49.0,42,Monthly,1100.0,14268.0,Spain,European Euro,...,Appropriate in length,No,Mathematics or statistics,ASP.NET;jQuery,ASP.NET;jQuery,Just as welcome now as I felt last year,40.0,7,7,7.134
12,13,"I am not primarily a developer, but I write co...",Yes,53.0,14,Monthly,3000.0,38916.0,Netherlands,European Euro,...,Too long,No,,,,A lot less welcome now than last year,36.0,35,20,21.62


In [171]:
# choose relevant columns
df_4 = df_3[['Respondent', 'MainBranch', 'Hobbyist', 'Age', 'Age1stCode',  'EdLevel', 'Employment', 'Gender', 'JobSat', 'OrgSize', 'PurchaseWhat', 'Trans', 'UndergradMajor', 'YearsCode', 'YearsCodePro', 'HourlyComp']]
# make some categorical variables numeric
df_4.loc[df_4['YearsCode'] == 'Less than 1 year', 'YearsCode'] = 0
df_4.loc[df_4['YearsCode'] == 'More than 50 years', 'YearsCode'] = 51
df_4.loc[df_4['YearsCodePro'] == 'Less than 1 year', 'YearsCodePro'] = 0
df_4.loc[df_4['YearsCodePro'] == 'More than 50 years', 'YearsCodePro'] = 51
df_4.loc[df_4['Age1stCode'] == 'Younger than 5 years', 'Age1stCode'] = 4
df_4.loc[df_4['Age1stCode'] == 'Older than 85', 'Age1stCode'] = 86
df_4['YearsCode'] = df_4['YearsCode'].astype(float)
df_4['YearsCodePro'] = df_4['YearsCodePro'].astype(float)
df_4['Age1stCode'] = df_4['Age1stCode'].astype(float)
df_4.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_4['YearsCode'] = df_4['YearsCode'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_4['YearsCodePro'] = df_4['YearsCodePro'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try usin

Unnamed: 0,Respondent,MainBranch,Hobbyist,Age,Age1stCode,EdLevel,Employment,Gender,JobSat,OrgSize,PurchaseWhat,Trans,UndergradMajor,YearsCode,YearsCodePro,HourlyComp
7,8,I am a developer by profession,Yes,36.0,12.0,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Employed full-time,Man,Slightly dissatisfied,20 to 99 employees,I have some influence,No,"Computer science, computer engineering, or sof...",17.0,13.0,59.487179
9,10,I am a developer by profession,Yes,22.0,14.0,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",Employed full-time,Man,Very satisfied,2 to 9 employees,I have a great deal of influence,No,Mathematics or statistics,8.0,4.0,17.952778
10,11,I am a developer by profession,Yes,23.0,13.0,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Employed full-time,Man,Slightly dissatisfied,"10,000 or more employees",I have little or no influence,No,"Computer science, computer engineering, or sof...",10.0,2.0,20.035
11,12,I am a developer by profession,No,49.0,42.0,Some college/university study without earning ...,Employed full-time,Man,Very dissatisfied,2 to 9 employees,I have little or no influence,No,Mathematics or statistics,7.0,7.0,7.134
12,13,"I am not primarily a developer, but I write co...",Yes,53.0,14.0,"Secondary school (e.g. American high school, G...",Employed full-time,Man,Very satisfied,2 to 9 employees,I have some influence,No,,35.0,20.0,21.62


In [172]:
# Fill numeric columns with the mean
num_vars = df_4.select_dtypes(include=['float', 'int']).columns
for col in num_vars:
    df_4[col].fillna((df_4[col].mean()), inplace=True)
    
# Dummy the categorical variables
cat_vars = df_4.select_dtypes(include=['object']).copy().columns
for var in cat_vars:
    # for each cat add dummy var, drop original column
    df_4 = pd.concat([df_4.drop(var, axis=1), pd.get_dummies(df_4[var], prefix=var, prefix_sep='_')], axis=1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [173]:
# split into train and test set
y = df_4['HourlyComp']
X = df_4.drop(columns=['HourlyComp'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [174]:
# fit model
lm_model = LinearRegression(normalize=True)
lm_model.fit(X_train, y_train)
y_pred = lm_model.predict(X_test)
r2_score(y_test, y_pred)

-0.008260005218442945

In [175]:
# find correlations by coefficients in linear model
coefs_df = pd.DataFrame()
coefs_df['est_int'] = X_train.columns
coefs_df['coefs'] = lm_model.coef_
coefs_df['abs_coefs'] = np.abs(lm_model.coef_)
coefs_df = coefs_df.sort_values('abs_coefs', ascending=False)
coefs_df.head(20)

Unnamed: 0,est_int,coefs,abs_coefs
7,Hobbyist_No,115631200000000.0,115631200000000.0
8,Hobbyist_Yes,115631200000000.0,115631200000000.0
5,MainBranch_I am a developer by profession,-33795050000000.0,33795050000000.0
6,"MainBranch_I am not primarily a developer, but...",-33795050000000.0,33795050000000.0
16,EdLevel_Secondary school (e.g. American high s...,-83.55474,83.55474
15,"EdLevel_Professional degree (JD, MD, etc.)",-64.28498,64.28498
37,OrgSize_2 to 9 employees,-62.34352,62.34352
20,"Employment_Independent contractor, freelancer,...",61.61569,61.61569
34,OrgSize_10 to 19 employees,-61.21245,61.21245
25,Gender_Woman;Man,-53.29907,53.29907


## 3. Question: How does experience change the way coders handle problems?

In [159]:
df_5 = df.dropna(subset=['YearsCode', 'NEWStuck'])
df_5.shape

(53085, 61)

In [160]:
df_5['NEWStuck'] = df_5['NEWStuck'].str.split(';', expand=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_5['NEWStuck'] = df_5['NEWStuck'].str.split(';', expand=False)


In [161]:
df_6 = df_5[['NEWStuck', 'YearsCode']]
df_6

Unnamed: 0,NEWStuck,YearsCode
0,"[Visit Stack Overflow, Go for a walk or other ...",36
1,"[Visit Stack Overflow, Go for a walk or other ...",7
4,"[Call a coworker or friend, Visit Stack Overfl...",15
5,"[Play games, Visit Stack Overflow, Watch help ...",6
7,"[Play games, Call a coworker or friend, Visit ...",17
...,...,...
64441,"[Visit Stack Overflow, Go for a walk or other ...",7
64446,"[Call a coworker or friend, Visit Stack Overfl...",6
64447,"[Call a coworker or friend, Watch help / tutor...",6
64453,[Watch help / tutorial videos],3


In [162]:
possible_answers = ['Call a coworker or friend',
'Visit Stack Overflow',
'Watch help / tutorial videos',
'Panic',
'Go for a walk or other physical activity',
'Meditate',
'Do other work and come back later', 'Play games']
def list_to_dummies(row):
    """ 
    splits elements in list in NEWStuck into dummy variables for each element
    """
    for answer in possible_answers:
        if answer in row['NEWStuck']:
            row[answer] = 1
        else:
            row[answer] = 0
    return row

df_6 = df_6.apply(list_to_dummies, axis=1)
df_6 = df_6.drop(columns=['NEWStuck'])

In [163]:
df_6.head()

Unnamed: 0,YearsCode,Call a coworker or friend,Visit Stack Overflow,Watch help / tutorial videos,Panic,Go for a walk or other physical activity,Meditate,Do other work and come back later,Play games
0,36,0,1,0,0,1,0,1,0
1,7,0,1,0,0,1,0,0,0
4,15,1,1,1,0,0,0,1,0
5,6,0,1,1,0,0,0,1,1
7,17,1,1,1,0,1,0,1,1


In [166]:
# make YearsCode numerical
df_6.loc[df_6['YearsCode'] == 'Less than 1 year', 'YearsCode'] = 0
df_6.loc[df_6['YearsCode'] == 'More than 50 years', 'YearsCode'] = 51

df_6['YearsCode'] = df_6['YearsCode'].astype(float)

# split into train and test set
y = df_6['YearsCode']
X = df_6.drop(columns=['YearsCode'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [167]:
# fit linear regression model
lm_model = LinearRegression(normalize=True)
lm_model.fit(X_train, y_train)
y_pred = lm_model.predict(X_test)
r2_score(y_test, y_pred)

0.04092891948590782

In [168]:
# show correlation as coefficients in Linear Regression model
coefs_df = pd.DataFrame()
coefs_df['est_int'] = X_train.columns
coefs_df['coefs'] = lm_model.coef_
coefs_df['abs_coefs'] = np.abs(lm_model.coef_)
coefs_df = coefs_df.sort_values('abs_coefs', ascending=False)
coefs_df.head(20)

Unnamed: 0,est_int,coefs,abs_coefs
3,Panic,-2.395679,2.395679
2,Watch help / tutorial videos,-2.190738,2.190738
7,Play games,-2.10683,2.10683
4,Go for a walk or other physical activity,1.732896,1.732896
6,Do other work and come back later,1.5486,1.5486
5,Meditate,0.70723,0.70723
1,Visit Stack Overflow,-0.397707,0.397707
0,Call a coworker or friend,0.120678,0.120678
