# Analysis of Diversity in Programming Language Communities

Data downloaded from here: <https://insights.stackoverflow.com/survey>

# Data Loading

In [8]:
from pathlib import Path
import pandas as pd
import requests
import zipfile
import io

pd.set_option("display.precision", 2)

In [13]:
DATASET_URL = 'https://info.stackoverflowsolutions.com/rs/719-EMH-566/images/stack-overflow-developer-survey-2022.zip'
DATASET_FILE = 'survey_results_public.csv'
if not Path(DATASET_FILE).exists():
    r = requests.get(DATASET_URL)
    z = zipfile.ZipFile(io.BytesIO(r.content))
    z.extract(DATASET_FILE)

In [15]:
df = pd.read_csv(DATASET_PATH)
df.LanguageHaveWorkedWith = df.LanguageHaveWorkedWith.fillna('')

In [16]:
langs = set()
for ls in df.LanguageHaveWorkedWith:
    if ls != '':
        langs |= set(ls.split(';'))

# Data Processing

In [17]:
div = []
for lang in langs:
    dfl = df[df.LanguageHaveWorkedWith.str.contains(lang, regex=False)]
    
    N = len(dfl)
    cis = dfl.Trans[dfl.Trans == 'No'].count()
    male = dfl.Gender[dfl.Gender == 'Man'].count()
    white = (dfl.Ethnicity.str.contains('White') | dfl.Ethnicity.str.contains('European')).sum()
    
    div.append({
        'lang': lang,
        'N': N,
        'white': white/N,
        'male': male/N,
        'cis': cis/N
    })
div = pd.DataFrame(div)
div

Unnamed: 0,lang,N,white,male,cis
0,APL,504,0.45,0.73,0.72
1,Ruby,4299,0.66,0.88,0.93
2,Clojure,1070,0.69,0.89,0.91
3,Objective-C,1698,0.61,0.9,0.92
4,Dart,4648,0.48,0.92,0.94
5,Perl,1644,0.69,0.87,0.89
6,LISP,932,0.7,0.85,0.87
7,R,12996,0.68,0.89,0.93
8,TypeScript,24752,0.64,0.91,0.95
9,Rust,6625,0.71,0.89,0.91


# Data Analysis

In [18]:
for k in ['white', 'male', 'cis']:
    print(f'lowest % of: {k}')
    print(div.sort_values(k).iloc[:10])    
    print('\n')

lowest % of: white
        lang      N  white  male   cis
0        APL    504   0.45  0.73  0.72
36  Solidity   1031   0.46  0.90  0.91
4       Dart   4648   0.48  0.92  0.94
22       SAS    435   0.52  0.81  0.85
34    MATLAB   2913   0.56  0.88  0.93
29       PHP  14827   0.59  0.90  0.94
28   Crystal    340   0.59  0.81  0.81
14       C++  16024   0.60  0.90  0.93
26      Java  53553   0.60  0.90  0.94
25  HTML/CSS  39142   0.60  0.90  0.94


lowest % of: male
       lang     N  white  male   cis
0       APL   504   0.45  0.73  0.72
22      SAS   435   0.52  0.81  0.85
28  Crystal   340   0.59  0.81  0.81
12    OCaml   422   0.66  0.83  0.83
10    COBOL   464   0.66  0.84  0.85
6      LISP   932   0.70  0.85  0.87
15  Haskell  1577   0.70  0.85  0.89
37   Erlang   641   0.68  0.85  0.87
35  Fortran   646   0.69  0.86  0.87
23    Julia  1084   0.63  0.86  0.88


lowest % of: cis
       lang     N  white  male   cis
0       APL   504   0.45  0.73  0.72
28  Crystal   340   0.59  0.81  