# Extract data from the Stack Overflow Developer Survey 2022

## Load the downloaded HTML file

In [2]:
from bs4 import BeautifulSoup

# Parse the locally saved survey page. The page declares charset="utf-8",
# so the encoding is passed explicitly instead of relying on the platform
# default, which is not UTF-8 everywhere.
with open('./data/Stack Overflow Developer Survey 2022.html', encoding='utf-8') as fp:
    soup = BeautifulSoup(fp, 'html.parser')

# Quick sanity check that the expected document was loaded.
print(soup.head.contents)


['\n', <meta charset="utf-8"/>, '\n', <title>Stack Overflow Developer Survey 2022</title>, '\n', <script src="https://cdn.cookielaw.org/consent/d06435e0-17fb-4659-9a14-1930a6e0be80/OtAutoBlock.js"></script>, '\n', <script charset="utf-8" data-domain-script="d06435e0-17fb-4659-9a14-1930a6e0be80" src="https://cdn.cookielaw.org/scripttemplates/otSDKStub.js"></script>, '\n', <script>function OptanonWrapper(){}</script>, '\n', <meta content="width=device-width, height=device-height, initial-scale=1.0, minimum-scale=1.0" name="viewport"/>, '\n', <meta content="In May 2022 over 70,000 developers told us how they learn and level up, which tools they’re using, and what they want." name="description"/>, '\n', <meta content="summary_large_image" name="twitter:card"/>, '\n', <meta content="@stackoverflow" name="twitter:site"/>, '\n', <meta content="Stack Overflow Developer Survey 2022" name="twitter:title"/>, '\n', <meta content="In May 2022 over 70,000 developers told us how they learn and level 

## Extract programming, scripting, and markup languages

In [3]:
# Locate the survey table with id 'languageepiif' and its label cells.
programming_languages_table = soup.find('table', id='languageepiif')
programming_languages_labels = programming_languages_table.find_all('td', class_='label')

# The first child of each label cell is kept as the entry for that row.
programming_scripting_markup_languages = [
    label_cell.contents[0] for label_cell in programming_languages_labels
]

print(programming_scripting_markup_languages)

['JavaScript', 'HTML/CSS', 'SQL', 'Python', 'TypeScript', 'Java', 'Bash/Shell', 'C#', 'C++', 'PHP', 'C', 'PowerShell', 'Go', 'Rust', 'Kotlin', 'Dart', 'Ruby', 'Assembly', 'Swift', 'R', 'VBA', 'MATLAB', 'Lua', 'Groovy', 'Delphi', 'Scala', 'Objective-C', 'Perl', 'Haskell', 'Elixir', 'Julia', 'Clojure', 'Solidity', 'LISP', 'F#', 'Fortran', 'Erlang', 'APL', 'COBOL', 'SAS', 'OCaml', 'Crystal']


## Extract databases

In [4]:
# Locate the survey table with id 'databaseyuhmf' and its label cells.
databases_table = soup.find('table', id='databaseyuhmf')
databases_labels = databases_table.find_all('td', class_='label')

# The first child of each label cell is kept as the entry for that row.
databases = [label_cell.contents[0] for label_cell in databases_labels]

print(databases)

['MySQL', 'PostgreSQL', 'SQLite', 'MongoDB', 'Microsoft SQL Server', 'Redis', 'MariaDB', 'Elasticsearch', 'Oracle', 'Firebase Realtime Database', 'DynamoDB', 'Cloud Firestore', 'Cassandra', 'Neo4j', 'IBM DB2', 'Couchbase', 'CouchDB']


## Extract cloud platforms

In [5]:
# Locate the survey table with id 'platformievhy' and its label cells.
cloud_platforms_table = soup.find('table', id='platformievhy')
cloud_platforms_labels = cloud_platforms_table.find_all('td', class_='label')

# The first child of each label cell is kept as the entry for that row.
cloud_platforms = [label_cell.contents[0] for label_cell in cloud_platforms_labels]

print(cloud_platforms)

['AWS', 'Microsoft Azure', 'Google Cloud', 'Firebase', 'Heroku', 'DigitalOcean', 'VMware', 'Managed Hosting', 'Linode', 'OVH', 'Oracle Cloud Infrastructure', 'OpenStack', 'IBM Cloud or Watson', 'Colocation']


## Extract web frameworks and technologies

In [6]:
# Locate the survey table with id 'webframew4ni7' and its label cells.
web_frameworks_and_technologies_table = soup.find('table', id='webframew4ni7')
web_frameworks_and_technologies_labels = web_frameworks_and_technologies_table.find_all('td', class_='label')

# The first child of each label cell is kept as the entry for that row.
web_frameworks_and_technologies = [
    label_cell.contents[0] for label_cell in web_frameworks_and_technologies_labels
]

print(web_frameworks_and_technologies)

['Node.js', 'React.js', 'jQuery', 'Express', 'Angular', 'Vue.js', 'ASP.NET Core', 'ASP.NET', 'Django', 'Flask', 'Next.js', 'Laravel', 'Angular.js', 'FastAPI', 'Ruby on Rails', 'Svelte', 'Blazor', 'Nuxt.js', 'Symfony', 'Gatsby', 'Drupal', 'Phoenix', 'Fastify', 'Deno', 'Play Framework']


## Extract other frameworks and libraries

In [7]:
# Locate the survey table with id 'misc-techyxc3f' and its label cells.
other_frameworks_and_libraries_table = soup.find('table', id='misc-techyxc3f')
other_frameworks_and_libraries_labels = other_frameworks_and_libraries_table.find_all('td', class_='label')

# The first child of each label cell is kept as the entry for that row.
other_frameworks_and_libraries = [
    label_cell.contents[0] for label_cell in other_frameworks_and_libraries_labels
]

print(other_frameworks_and_libraries)

['.NET', 'NumPy', 'Pandas', 'Spring', 'TensorFlow', 'Flutter', 'Scikit-learn', 'React Native', 'Apache Kafka', 'Electron', 'Torch/PyTorch', 'Qt', 'Keras', 'Ionic', 'Xamarin', 'Apache Spark', 'Cordova', 'Hadoop', 'GTK', 'Capacitor', 'Tidyverse', 'Hugging Face Transformers', 'Uno Platform']


## Extract other tools

In [8]:
# Locate the survey table with id 'tools-techzn2jb' and its label cells.
other_tools_table = soup.find('table', id='tools-techzn2jb')
other_tools_labels = other_tools_table.find_all('td', class_='label')

# The first child of each label cell is kept as the entry for that row.
other_tools = [label_cell.contents[0] for label_cell in other_tools_labels]

print(other_tools)

['npm', 'Docker', 'Yarn', 'Homebrew', 'Kubernetes', 'Terraform', 'Unity 3D', 'Ansible', 'Unreal Engine', 'Puppet', 'Chef', 'Pulumi', 'Flow']


## Export data to CSV files

In [9]:
import csv

# Each export job describes one single-column CSV file: 'headers' holds the
# column title (its first entry is also used as the file name) and
# 'data_list' holds the values to write, one per row.
# NOTE(review): the first header is singular while the others are plural; it
# is left unchanged because it also determines the output file name.
export_jobs = [
    {
        'headers': ['Programming, scripting, and markup language'],
        'data_list': programming_scripting_markup_languages,
    },
    {
        'headers': ['Databases'],
        'data_list': databases,
    },
    {
        'headers': ['Cloud platforms'],
        'data_list': cloud_platforms,
    },
    {
        'headers': ['Web frameworks and technologies'],
        'data_list': web_frameworks_and_technologies,
    },
    {
        'headers': ['Other frameworks and libraries'],
        'data_list': other_frameworks_and_libraries,
    },
    {
        'headers': ['Other tools'],
        'data_list': other_tools,
    },
]

# Running count of all exported entries across every job.
total_skills = 0
for export_job in export_jobs:
    headers = export_job['headers']
    data_list = export_job['data_list']
    total_skills += len(data_list)

    # newline='' lets the csv module control line endings itself; without it
    # an extra blank line appears after every row on Windows. Plain 'w' is
    # sufficient because the file is only written, never read back.
    with open(
        f'./data/Stack_Overflow_Survey_2022/{headers[0]}.csv',
        encoding='UTF-8',
        mode='w',
        newline='',
    ) as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(headers)
        # One single-cell row per extracted entry.
        csv_writer.writerows([data] for data in data_list)

print(total_skills)

134
