## WomenTech Insights

In [142]:
# Import Dependencies
import pandas as pd
import numpy as np
import os

from selenium import webdriver
from selenium.webdriver.chrome.options import Options  
from selenium.webdriver.common.keys import Keys
import time

from bs4 import BeautifulSoup

In [227]:
# URL of Event and connection
url = "https://app.hopin.to/events/womentech-global-conference-2020/reception"

chrome_options = Options()  
driver = webdriver.Chrome(executable_path=os.path.abspath("chromedriver"), options=chrome_options) 
driver.get(url)
time.sleep(5)

# Create BeautifulSoup object
html = driver.page_source
soup = BeautifulSoup(html,'lxml')

# Build presentation titles content list
content_list = []
schedule = soup.find_all("div", class_="text -semi-bold mt-8")
for item in schedule:
    content = item.text
    content_list.append(content)
    
# Build presenter titles list
titles_list = []
titles = soup.find_all("p", class_="text -light mt-4")    
for element in titles:
    role = element.text
    titles_list.append(role)

In [228]:
# Delete company sessions and expos
for item in list(content_list):
    if "Session" in item:
        content_list.remove(item)
    elif "Expo" in item:
        content_list.remove(item)

In [288]:
print(f"There were {len(content_list)} panels during the conference excluding career and expo panels.")
print(f"There were {len(titles_list)} presenters at the conference.")

There were 298 panels during the conference excluding career and expo panels.
There were 301 presenters at the conference.


In [289]:
# Find most frequent words in presentation titles
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize, punkt
from collections import Counter
import json

# Get rid of stopwords
drop_chars = ["a", "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "you're", "you've", "you'll", "you'd", 
              "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "she's", "her", "hers", 
              "herself", "it", "it's", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", 
              "who", "whom", "this", "that", "that'll", "these", "those", "am", "is", "are", "was", "were", "be", "been", 
              "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", 
              "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", 
              "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", 
              "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", 
              "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", 
              "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "don't", "should", "should've", 
              "now", "d", "ll", "m", "o", "re", "ve", "y", "ain", "aren", "aren't", "couldn", "couldn't", "didn", "didn't", 
              "doesn", "doesn't", "hadn", "hadn't", "hasn", "hasn't", "haven", "haven't", "isn", "isn't", "ma", "mightn", 
              "mightn't", "mustn", "mustn't", "needn", "needn't", "shan", "shan't", "shouldn", "shouldn't", "wasn", "wasn't", 
              "weren", "weren't", "won", "won't", "the", "how", "wouldn", "wouldn't", "-", ":", "&", ",", "!", "'('", "(')'",
              "/", "|", "@"]

content_string = ' '.join(content_list)
content_split = content_string.split()

for word in content_split:
    if word.casefold() in drop_chars:
        content_split.remove(word)

counter_content = Counter(content_split)
most_freq_content = counter_content.most_common(25)
print(most_freq_content)

[('the', 34), ('a', 28), ('to', 22), ('Tech', 18), ('Power', 11), ('Data', 9), ('Technology', 9), ('new', 9), ('Career', 8), ('COVID-19', 8), ('Building', 8), ('Women', 8), ('Learning', 8), ('your', 8), ('AI', 7), ('Business', 7), ('Social', 6), ('Science', 6), ('Culture', 6), ('Future', 6), ('Leadership', 6), ('build', 6), ('tech', 6), ('journey', 5), ('Intelligence', 5)]


In [294]:
# Find most frequent titles of presenters
titles = []

for item in titles_list:
    titles.append(item.strip())

title_string = ' '.join(titles)
title_split = title_string.split()

for word in title_split:
    if word.casefold() in drop_chars:
        title_split.remove(word)

counter_titles = Counter(title_split)
most_freq_titles = counter_titles.most_common(10)
print("Most Common Words in Presenter Titles and Their Counts")
print("-------------------------------------")

print(most_freq_titles)

Most Common Words in Presenter Titles and Their Counts
-------------------------------------
[('CEO', 36), ('Founder', 34), ('Software', 33), ('Engineer', 29), ('Senior', 27), ('Director', 24), ('Data', 18), ('Manager', 16), ('Consultant', 16), ('Developer', 14)]
