In [1]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier as KN
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score as a_s
from sklearn.model_selection import KFold

In [2]:
# Load the 2016 freeCodeCamp New Coders Survey export into a DataFrame.
data = pd.read_csv(
    "2016-FCC-New-Coders-Survey-Data.csv",
    low_memory=False,
)

In [3]:
# Number of survey responses (rows).
data.shape[0]

15620

In [4]:
# Show the first 24 column names.
data.columns[:24]

Index(['Age', 'AttendedBootcamp', 'BootcampFinish', 'BootcampFullJobAfter',
       'BootcampLoanYesNo', 'BootcampMonthsAgo', 'BootcampName',
       'BootcampPostSalary', 'BootcampRecommend', 'ChildrenNumber',
       'CityPopulation', 'CodeEventBootcamp', 'CodeEventCoffee',
       'CodeEventConferences', 'CodeEventDjangoGirls', 'CodeEventGameJam',
       'CodeEventGirlDev', 'CodeEventHackathons', 'CodeEventMeetup',
       'CodeEventNodeSchool', 'CodeEventNone', 'CodeEventOther',
       'CodeEventRailsBridge', 'CodeEventRailsGirls'],
      dtype='object')

In [5]:
# Show the remaining column names (from position 24 onwards).
data.columns[24:]

Index(['CodeEventStartUpWknd', 'CodeEventWomenCode', 'CodeEventWorkshop',
       'CommuteTime', 'CountryCitizen', 'CountryLive', 'EmploymentField',
       'EmploymentFieldOther', 'EmploymentStatus', 'EmploymentStatusOther',
       'ExpectedEarning', 'FinanciallySupporting', 'Gender', 'HasChildren',
       'HasDebt', 'HasFinancialDependents', 'HasHighSpdInternet',
       'HasHomeMortgage', 'HasServedInMilitary', 'HasStudentDebt',
       'HomeMortgageOwe', 'HoursLearning', 'ID.x', 'ID.y', 'Income',
       'IsEthnicMinority', 'IsReceiveDiabilitiesBenefits', 'IsSoftwareDev',
       'IsUnderEmployed', 'JobApplyWhen', 'JobPref', 'JobRelocateYesNo',
       'JobRoleInterest', 'JobRoleInterestOther', 'JobWherePref',
       'LanguageAtHome', 'MaritalStatus', 'MoneyForLearning',
       'MonthsProgramming', 'NetworkID', 'Part1EndTime', 'Part1StartTime',
       'Part2EndTime', 'Part2StartTime', 'PodcastChangeLog',
       'PodcastCodeNewbie', 'PodcastCodingBlocks', 'PodcastDeveloperTea',
       'Pod

In [16]:
# Count respondents whose AttendedBootcamp flag compares equal to True.
int((data.AttendedBootcamp == True).sum())

953

In [7]:
def gender_to_number(x):
    """Encode a row's Gender as a number.

    Returns 1 when ``x.Gender`` is 'male', 0 when it is 'female', and
    None for every other value.
    """
    gender = x.Gender
    if gender == 'male':
        return 1
    return 0 if gender == 'female' else None

In [8]:
# List the distinct CityPopulation answers (including NaN for missing ones).
data.CityPopulation.unique()

array(['between 100,000 and 1 million', 'more than 1 million',
       'less than 100,000', nan], dtype=object)

In [9]:
def city_to_number(x):
    """Encode a row's CityPopulation bracket as an ordinal number.

    'less than 100,000' -> 1, 'between 100,000 and 1 million' -> 2,
    'more than 1 million' -> 3; any other value yields None.
    """
    population = x.CityPopulation
    ordered_brackets = (
        'less than 100,000',
        'between 100,000 and 1 million',
        'more than 1 million',
    )
    for code, bracket in enumerate(ordered_brackets, start=1):
        if population == bracket:
            return code
    return None

In [10]:
# Encode Gender numerically: 'male' -> 1, 'female' -> 0, anything else
# (other answers or missing values) -> NaN.
# Series.map does this in one vectorised pass; the previous per-row
# data.loc[i] lookup built a complete row object for every respondent and
# only worked while the index was exactly 0..len(data)-1.
data["gender_num"] = data.Gender.map({'male': 1, 'female': 0})

In [11]:
# Encode the CityPopulation bracket as an ordinal 1/2/3; values outside the
# three known brackets (including missing answers) become NaN.
# Series.map does this in one vectorised pass; the previous per-row
# data.loc[i] lookup built a complete row object for every respondent and
# only worked while the index was exactly 0..len(data)-1.
data["city_size"] = data.CityPopulation.map({
    'less than 100,000': 1,
    'between 100,000 and 1 million': 2,
    'more than 1 million': 3,
})

In [12]:
# Respondents per city-size code (1 = smallest bracket, 3 = largest), one per line.
print(
    len(data[data.city_size == 1]),
    len(data[data.city_size == 2]),
    len(data[data.city_size == 3]),
    sep="\n",
)

3155
4917
5558


In [17]:
# Keep only respondents who answered 'male' or 'female'.
data_gender = data[data["Gender"].isin(['male', 'female'])]
data_gender.shape[0]

13606

In [None]:
# Rows that have all four modelling columns present.
data1 = data_gender.loc[:, ["gender_num", "Income", "MonthsProgramming", "city_size"]].dropna(how="any")
len(data1)

# Income (x) against the 0/1 gender code (y).
x, y = data1.Income, data1.gender_num
plt.scatter(x, y)
plt.show()

In [None]:
# NOTE: despite the "_less_36" suffix, the cut-off applied here is 12 months.
data_month_prog_less_36 = data1.loc[data1.MonthsProgramming <= 12]
print(data_month_prog_less_36.shape[0])

# Income (x) against the 0/1 gender code (y) for this subset.
x, y = data_month_prog_less_36.Income, data_month_prog_less_36.gender_num
plt.scatter(x, y)
plt.show()

In [None]:
# Respondents with at most 24 months of programming experience.
data_month = data1[data1.MonthsProgramming <= 24]
# Report the size of the frame built above (the cell previously printed the
# length of the unrelated data_month_prog_less_36 subset).
print(len(data_month))
# Months of programming (x) against income (y).
x = data_month.MonthsProgramming
y = data_month.Income
plt.scatter(x, y)
plt.show()

In [None]:


# Months of programming (x) against the 0/1 gender code (y).
x, y = data1.MonthsProgramming, data1.gender_num
plt.scatter(x, y)
plt.show()

In [None]:
# List the distinct values stored in BootcampFullJobAfter.
data.BootcampFullJobAfter.unique()

In [None]:
# Count rows where BootcampFullJobAfter holds an explicit 1.0 or 0.0 answer.
int(((data.BootcampFullJobAfter == 1.0) | (data.BootcampFullJobAfter == 0.0)).sum())

In [None]:
# Count rows with a positive BootcampMonthsAgo value.
int((data.BootcampMonthsAgo > 0).sum())

In [None]:
# List the distinct JobApplyWhen answers.
data.JobApplyWhen.unique()

In [None]:
# List the distinct EmploymentStatus answers.
data.EmploymentStatus.unique()

In [None]:
# List the distinct values stored in BootcampFinish.
data.BootcampFinish.unique()

In [None]:
# Count rows where BootcampFinish holds an explicit 1.0 or 0.0 answer.
int(((data.BootcampFinish == 1.0) | (data.BootcampFinish == 0.0)).sum())

In [None]:
# Count rows with a positive BootcampPostSalary value.
int((data.BootcampPostSalary > 0).sum())

In [None]:
# City size and bootcamp job outcome, restricted to rows where both are present.
data3 = data.loc[:, ["CityPopulation", "BootcampFullJobAfter"]].dropna(how="any")

In [None]:
# NOTE(review): disabled plot. data3 is built from only CityPopulation and
# BootcampFullJobAfter, so data3.BootcampPostSalary below would fail if this
# were re-enabled as written.
#x = data3.CityPopulation
#y = data3.BootcampPostSalary
#plt.scatter(x, y)
#plt.show()

In [None]:
# Gender and income, restricted to rows where both are present.
data4 = data.loc[:, ["Gender", "Income"]].dropna(how="any")

In [None]:
# Rows with both Gender and Income present.
data4.shape[0]

In [None]:
# List the distinct Gender answers.
data.Gender.unique()

In [None]:
# Male respondents with a known income.
data4_1 = data4.loc[data4["Gender"] == 'male']

In [None]:
# Number of male respondents with a known income.
data4_1.shape[0]

In [None]:
# Female respondents with a known income.
data4_0 = data4.loc[data4["Gender"] == 'female']

In [None]:
# Number of female respondents with a known income.
data4_0.shape[0]

In [None]:
# Income histogram for men: 5,000-wide bins from 0 up to (not including) 200,000.
plt.figure(figsize=(3, 3))
plt.title("Male")
plt.hist(
    data4_1.Income,
    bins=np.arange(0, 200000, 5000),
)
plt.show()

In [None]:
# Income histogram for women: 5,000-wide bins from 0 up to (not including) 200,000.
plt.figure(figsize=(3, 3))
plt.title("Female")
plt.hist(
    data4_0.Income,
    bins=np.arange(0, 200000, 5000),
)
plt.show()

In [None]:
# Summary statistics for the male subset (data4_1).
data4_1.describe()

In [None]:
# Summary statistics for the female subset (data4_0).
data4_0.describe()

In [None]:
# Two-sample t-test on income, male vs. female, without assuming equal variances.
tstat, pval = ttest_ind(
    data4_1.Income,
    data4_0.Income,
    equal_var=False,
)
print(tstat, pval)

In [None]:
# List the distinct SchoolDegree answers.
data.SchoolDegree.unique()

In [None]:
# Programming experience, developer flag and gender code, complete rows only.
data_dev = data.loc[:, ["MonthsProgramming", "IsSoftwareDev", "gender_num"]].dropna(how="any")

In [None]:
# Count male software developers with fewer than 6 months of programming.
# Every condition is built from data_dev itself, so the boolean mask has the
# same index as the frame it filters. Previously two of the three conditions
# came from the unfiltered `data`, forcing pandas to reindex the mask.
len(data_dev[
    (data_dev.IsSoftwareDev == 1.0)
    & (data_dev.MonthsProgramming < 6)
    & (data_dev.gender_num == 1)
])

In [None]:
# Count data_dev rows with fewer than 12 months of programming.
int((data_dev.MonthsProgramming < 12).sum())

In [None]:
# Share of meetup attendees who also attended a bootcamp.
a = int(((data.CodeEventMeetup == 1) & (data.AttendedBootcamp == 1)).sum())
b = int((data.CodeEventMeetup == 1).sum())
a / b


In [None]:
# Share of meetup attendees who did not attend a bootcamp.
a = int(((data.CodeEventMeetup == 1) & (data.AttendedBootcamp == 0)).sum())
b = int((data.CodeEventMeetup == 1).sum())
a / b

In [None]:
# Names of all columns whose name starts with "CodeEvent".
events = [col for col in data.columns if col.startswith("CodeEvent")]

In [None]:
# Show the collected CodeEvent column names and how many there are.
print(events, len(events))

In [None]:
# Sub-frame holding only the CodeEvent* columns.
data_events = data.loc[:, events]

In [None]:
def attend_events(x):
    """Return 1 when ``x.any()`` compares equal to 1, otherwise 0.

    ``x`` is one row of event flags (a pandas Series in this notebook).
    """
    attended = x.any() == 1
    return int(bool(attended))

In [None]:
# One 0/1 flag per respondent, in row order: did they attend any event?
event_column = []
for row_label in range(len(data_events)):
    event_column.append(attend_events(data_events.loc[row_label]))

In [None]:
# Number of respondents flagged as having attended at least one event.
sum(1 for flag in event_column if flag == 1)

In [None]:
# Total number of rows in the CodeEvent sub-frame.
data_events.shape[0]

In [None]:
# Store the per-respondent 0/1 event flag on the main frame; later cells
# read data["attend_events"], so this cell must run before them.
data["attend_events"] = event_column

In [None]:
# Share of female respondents who attended at least one event.
fem_event = data.loc[data.Gender == 'female']

fem_attend_event = int((fem_event.attend_events == 1).sum()) / len(fem_event)
print(fem_attend_event)

In [None]:
# Share of male respondents who attended at least one event.
male_event = data.loc[data.Gender == 'male']

male_attend_event = int((male_event.attend_events == 1).sum()) / len(male_event)
print(male_attend_event)

In [20]:
# Share of bootcamp attendees who also attended at least one coding event.
# Requires data["attend_events"] to exist: run the cell that creates that
# column first, otherwise this raises the KeyError recorded below.
bc_all = data[["AttendedBootcamp", "attend_events"]].dropna()
bc_all = bc_all.reset_index(drop=True)
# Build the mask from bc_all itself. bc_all was re-indexed after dropna(), so
# a mask taken from `data` no longer lines up row-for-row and would select
# the wrong respondents.
bc_event = bc_all[bc_all.AttendedBootcamp == 1]
bc_attend_event = len(bc_event[bc_event.attend_events == 1]) / len(bc_event)
print(bc_attend_event)

KeyError: "['attend_events'] not in index"

### 4% of men attended a bootcamp, compared with 11% of women

### Country of residence vs. gender

In [18]:
# Share of men (gender_num == 1) living in the USA, among rows where both
# country and gender code are present.
gender_all = data.loc[:, ["CountryLive", "gender_num"]].dropna(how="any").reset_index(drop=True)
gender_men = gender_all.loc[gender_all.gender_num == 1]
men_usa = int((gender_men.CountryLive == "United States of America").sum()) / len(gender_men)
print(men_usa)

0.41127741692553893


In [19]:
# Share of women (gender_num == 0) living in the USA.
gender_fem = gender_all.loc[gender_all.gender_num == 0]
fem_usa = int((gender_fem.CountryLive == "United States of America").sum()) / len(gender_fem)
print(fem_usa)

0.6362986780993212


### age vs gender

In [None]:
# Share of men (gender_num == 1) with CodeEventMeetup == 1.
# NOTE: the result name says "bc", but the column used is CodeEventMeetup.
gender_al = data.loc[:, ["CodeEventMeetup", "gender_num"]].dropna(how="any").reset_index(drop=True)
gender_men = gender_al.loc[gender_al.gender_num == 1]
men_attend_bc = int((gender_men.CodeEventMeetup == 1).sum()) / len(gender_men)
print(men_attend_bc)

In [None]:
# Number of respondents with CodeEventMeetup == 1.
int((data.CodeEventMeetup == 1).sum())

In [None]:
# Share of women (gender_num == 0) who attended a bootcamp.
# NOTE: the "city" names are reused here although no city column is involved.
data_city = data.loc[data.gender_num == 0]
city_attend_event = int((data_city.AttendedBootcamp == 1).sum()) / len(data_city)
print(city_attend_event)

In [None]:
# Share of women (gender_num == 0) among respondents in city-size code 2.
data_city = data.loc[data.city_size == 2]
city_attend_event = int((data_city.gender_num == 0).sum()) / len(data_city)
print(city_attend_event)

In [None]:
# Share of men (gender_num == 1) among respondents in city-size code 3.
data_city = data.loc[data.city_size == 3]
city_attend_event = int((data_city.gender_num == 1).sum()) / len(data_city)
print(city_attend_event)

In [None]:
# Age histogram for respondents living in the USA, using 2-year bins from 0
# up to (not including) 80. `bins` is reused by the following cell.
bins = np.arange(0, 80, 2)
data_gen_ag = data.loc[data.CountryLive == "United States of America"]
plt.hist(
    data_gen_ag.Age.dropna(),
    bins=bins,
)
plt.title('USA Age distribution')
plt.show()

In [None]:
# Age histogram for male respondents living outside the USA.
# The second filter is applied to the already-filtered frame; previously it
# restarted from `data`, which silently discarded the male-only filter while
# the plot was still titled "Male".
# Relies on `bins` defined in the previous (USA) histogram cell.
data_gen_ag = data[data.Gender == "male"]
data_gen_ag = data_gen_ag[data_gen_ag.CountryLive != "United States of America"]
plt.hist(data_gen_ag.Age.dropna(), bins=bins)
plt.title('Male Age distribution (outside USA)')
plt.show()

In [None]:
# Modelling frame: age, country and gender code, complete rows only, with a
# fresh 0..n-1 index so later positional lookups line up with labels.
data_cleaned = (
    data.loc[:, ["Age", "CountryLive", "gender_num"]]
    .dropna(how="any")
    .reset_index(drop=True)
)

In [None]:
def classify_country(x):
    """Binary-encode a row's CountryLive: 1 for the USA, 0 for anything else."""
    lives_in_usa = x.CountryLive == "United States of America"
    return 1 if lives_in_usa else 0

# 1 for respondents living in the USA, 0 otherwise.
# A vectorised comparison replaces the per-row data_cleaned.loc[i] lookup,
# which built a row object for every respondent and depended on the index
# being exactly 0..len(data_cleaned)-1.
data_cleaned["country_num"] = (
    data_cleaned.CountryLive == "United States of America"
).astype(int)

### All ages

In [None]:
# k-nearest-neighbours baseline: predict the gender code from the scaled
# country flag, using a single random 67/33 train/test split.
scaler = MinMaxScaler()

# Series.reshape is not available in current pandas, so reshape the underlying
# NumPy array into the (n_samples, 1) layout the scaler expects, then flatten
# the result back to one value per row for the column assignment.
data_cleaned["age_scaled"] = scaler.fit_transform(
    data_cleaned.Age.values.reshape(-1, 1)
).ravel()
data_cleaned["country_scaled"] = scaler.fit_transform(
    data_cleaned.country_num.values.reshape(-1, 1)
).ravel()

# No random_state is set, so the split (and the accuracy) varies between runs.
x_train, x_test, y_train, y_test = train_test_split(
    data_cleaned[["country_scaled"]],
    data_cleaned.gender_num,
    test_size=0.33
)

clf = KN(n_neighbors=4)

clf.fit(x_train, y_train)

y_model = clf.predict(x_test)

# Accuracy on the held-out third of the data (displayed as the cell output).
a_s(y_test, y_model)

### Cross Validation

In [None]:
# 5-fold cross-validation of a 34-neighbour classifier that predicts the
# gender code from the scaled country flag; prints one accuracy per fold.
kf = KFold(shuffle=True, n_splits=5)

kf.get_n_splits(data_cleaned)  # return value is not used

for train_index, test_index in kf.split(data_cleaned):
    # Fold indices are used as row labels of data_cleaned.
    X_train = data_cleaned.loc[train_index, ["country_scaled"]]
    X_test = data_cleaned.loc[test_index, ["country_scaled"]]
    y_train = data_cleaned.loc[train_index, "gender_num"]
    y_test = data_cleaned.loc[test_index, "gender_num"]

    clf = KN(n_neighbors=34)
    clf.fit(X_train, y_train)
    y_model = clf.predict(X_test)
    print("accuracy: ", a_s(y_test, y_model))

### Ages < 20

In [None]:
# Same k-NN baseline restricted to respondents younger than 20, predicting the
# gender code from scaled age.
scaler = MinMaxScaler()
# NOTE: the "_30" suffix is historical; the filter applied is Age < 20.
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of data_cleaned (SettingWithCopyWarning).
data_cleaned_30 = data_cleaned[data_cleaned.Age < 20].copy()
# Series.reshape is not available in current pandas, so reshape the underlying
# NumPy array to (n_samples, 1) for the scaler and flatten the result again.
data_cleaned_30["age_scaled"] = scaler.fit_transform(
    data_cleaned_30.Age.values.reshape(-1, 1)
).ravel()
data_cleaned_30["country_scaled"] = scaler.fit_transform(
    data_cleaned_30.country_num.values.reshape(-1, 1)
).ravel()

# No random_state is set, so the split (and the accuracy) varies between runs.
x_train, x_test, y_train, y_test = train_test_split(
    data_cleaned_30[["age_scaled"]],
    data_cleaned_30.gender_num,
    test_size=0.33
)

clf = KN(n_neighbors=4)

clf.fit(x_train, y_train)

y_model = clf.predict(x_test)

# Accuracy on the held-out third of the data (displayed as the cell output).
a_s(y_test, y_model)

In [None]:
# Modelling frame that also includes income: complete rows only, with a fresh
# 0..n-1 index.
data_cleaned_inc = (
    data.loc[:, ["Age", "Income", "gender_num", "CountryLive"]]
    .dropna(how="any")
    .reset_index(drop=True)
)

In [None]:
def classify_country(x):
    """Map a row's CountryLive to a class label: USA -> 1, everything else -> 0."""
    return int(bool(x.CountryLive == "United States of America"))

# 1 for respondents living in the USA, 0 otherwise.
# A vectorised comparison replaces the per-row data_cleaned_inc.loc[i] lookup,
# which built a row object for every respondent and depended on the index
# being exactly 0..len(data_cleaned_inc)-1.
data_cleaned_inc["country_num"] = (
    data_cleaned_inc.CountryLive == "United States of America"
).astype(int)

In [None]:
# Show the columns currently present on data_cleaned_inc.
data_cleaned_inc.columns

In [14]:
# k-NN baseline predicting the gender code from scaled income.
# Requires data_cleaned_inc (with its country_num column) from the cells
# above; running this cell first produces the NameError recorded below.
scaler = MinMaxScaler()
# Series.reshape is not available in current pandas, so reshape the underlying
# NumPy array to (n_samples, 1) for the scaler and flatten the result again.
data_cleaned_inc["age_scaled"] = scaler.fit_transform(
    data_cleaned_inc.Age.values.reshape(-1, 1)
).ravel()
data_cleaned_inc["income_scaled"] = scaler.fit_transform(
    data_cleaned_inc.Income.values.reshape(-1, 1)
).ravel()
data_cleaned_inc["country_scaled"] = scaler.fit_transform(
    data_cleaned_inc.country_num.values.reshape(-1, 1)
).ravel()

# No random_state is set, so the split (and the accuracy) varies between runs.
x_train, x_test, y_train, y_test = train_test_split(
    data_cleaned_inc[["income_scaled"]],
    data_cleaned_inc.gender_num,
    test_size=0.33
)

clf = KN(n_neighbors=4)

clf.fit(x_train, y_train)

y_model = clf.predict(x_test)

# Accuracy on the held-out third of the data (displayed as the cell output).
a_s(y_test, y_model)

NameError: name 'data_cleaned_inc' is not defined

In [None]:

# Print the age comparison between men and women, first for the USA and then
# for the rest of the world, each followed by its t-test result.
# NOTE(review): every *_usa / *_other variable read here is defined outside
# the cells visible in this part of the notebook; presumably an earlier cell
# computes them. Confirm it runs before this one.
print("USA:")
print("Women average age: {} years, std.dev = {}".format(round(women_age_avg_usa, 1), round(women_age_std_usa, 1)))
print("Men average age: {} years, std.dev = {}".format(round(men_age_avg_usa, 1), round(men_age_std_usa, 1)))
print("Difference in average age:" , round(difference_age_usa, 1))
print()
print('USA age difference for males vs. females, t-Test results:')
print('t-stat = {}, pval = {}'.format(round(tstat_usa, 1), round(pval_usa, 4)))
print()
print("The rest of the world:")
print("Women average age: {} years, std.dev = {}".format(round(women_age_avg_other, 1), round(women_age_std_other, 1)))
print("Men average age: {} years, std.dev = {}".format(round(men_age_avg_other, 1), round(men_age_std_other, 1)))
print("Difference in average age:" , round(difference_age_other, 1))
print()
print('t-Test results:')
print('t-stat = {}, pval = {}'.format(round(tstat_other, 1), round(pval_other, 4)))

In [None]:
# Investigate difference in age between men and women
# Avg age and std.dev by gender
# NOTE(review): data_age_gender and data_inc_gender are not defined in the
# cells visible here; presumably they are Gender/Age and Gender/Income frames
# prepared by an earlier cell. Confirm before running.
data_male_age = data_age_gender[data_age_gender.Gender == 'male'].Age
data_fem_age = data_age_gender[data_age_gender.Gender == 'female'].Age

women_age_avg = data_fem_age.mean()
men_age_avg = data_male_age.mean()
# Sign convention: women minus men (the income difference below is men minus women).
difference_age = women_age_avg - men_age_avg

women_age_std = data_fem_age.std()
men_age_std = data_male_age.std()

# Two-sample t-test (male vs. female) without assuming equal variances.
tstat, pval = ttest_ind(data_male_age, data_fem_age, equal_var=False)
print("1. Is there a difference in age between men and women who are actively learning how to code?")
print()
print("Women average age: {} years, std.dev = {}".format(round(women_age_avg, 1), round(women_age_std, 1)))
print("Men average age: {} years, std.dev = {}".format(round(men_age_avg, 1), round(men_age_std, 1)))
print("Difference in average age:" , round(difference_age, 1))
print()
print('t-Test results:')
print('t-stat = {}, pval = {}'.format(round(tstat, 1), round(pval, 4)))

# Visualize the age distribution for men and women

bins=np.arange(0, 80, 2)
plt.figure(figsize=(7, 3))
plt.hist(data_male_age, color='green', bins=bins, label="Male")
# Semi-transparent so the male histogram stays visible underneath.
plt.hist(data_fem_age, color='purple', alpha = 0.5, bins=bins, label="Female")

plt.title("Age distribution for Female and Male Participants")
plt.xlabel("Age")
plt.ylabel("Number of participants")
plt.legend()
plt.show()

########################################################
# Investigate difference in income between men and women
# Avg income and std.dev by gender
data_male_inc = data_inc_gender[data_inc_gender.Gender == 'male'].Income
data_fem_inc = data_inc_gender[data_inc_gender.Gender == 'female'].Income

women_inc_avg = data_fem_inc.mean()
men_inc_avg = data_male_inc.mean()

# Sign convention: men minus women (the age difference above is women minus men).
difference_inc = men_inc_avg - women_inc_avg

women_inc_std = data_fem_inc.std()
men_inc_std = data_male_inc.std()

# Two-sample t-test (male vs. female) without assuming equal variances.
# Note: this overwrites tstat/pval from the age test above.
tstat, pval = ttest_ind(data_male_inc, data_fem_inc, equal_var=False)
print("2. Is there a difference in income between men and women who are actively learning how to code?")
print()
print("Women average income: ${}, std.dev = {}".format(round(women_inc_avg, 1), round(women_inc_std, 1)))
print("Men average income: ${}, std.dev = {}".format(round(men_inc_avg, 1), round(men_inc_std, 1)))
print("Difference in average income:" , round(difference_inc, 1))
print()
print('t-Test results:')
print('t-stat = {}, pval = {}'.format(round(tstat, 1), round(pval, 4)))

# Visualize the income distribution for men and women
bins=np.arange(0, 200000, 10000)
plt.figure(figsize=(7, 3))
plt.hist(data_male_inc, color='green', bins=bins, label="Male")
# Semi-transparent so the male histogram stays visible underneath.
plt.hist(data_fem_inc, color='purple', alpha = 0.5, bins=bins, label="Female")

plt.title("Income distribution for Female and Male Participants")
plt.xlabel("Income, USD")
plt.ylabel("Number of participants")
plt.legend()
plt.show()
