# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
import nltk
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

# Load data

In [None]:
# Read the dataset from a path relative to the current working directory.
# The cells below index columns of clean_df by name.
clean_df = pd.read_csv("data/clean_data.csv")

# Breakdown of terms 

### Frequency

In [None]:
# Columns to chart. Each string cell is parsed with literal_eval and is
# expected to evaluate to a list of terms for one user.
cols_to_plot = ["Experience", "Skills", "Stages", "Objectives", "Interests", "Looking for"]

for category in cols_to_plot:
    # Gather every term listed by every user for this category.
    # Iterating the Series directly yields its values; Series.iteritems()
    # no longer exists in pandas >= 2.0.
    all_terms = []
    for val in clean_df[category]:
        # Skip missing cells (non-strings) and the literal "NaN" placeholder,
        # which literal_eval cannot parse.
        if isinstance(val, str) and val != "NaN":
            all_terms.extend(literal_eval(val))

    # Count occurrences and order them from most to least frequent.
    # sorted(...)[::-1] (rather than reverse=True) keeps the tie order stable
    # with respect to earlier versions of this cell.
    freq = nltk.FreqDist(all_terms)
    sorted_freq = dict(sorted(freq.items(), key=lambda item: item[1])[::-1])
    labels = list(sorted_freq.keys())

    # Express each count as a percentage of all terms in this category.
    total_terms = len(all_terms)
    term_freqs = [100 * (count / total_terms) for count in sorted_freq.values()]

    fig = px.pie(
        names=labels,
        values=term_freqs,
        title=category,
    )
    fig.update_traces(textposition='inside', textinfo='percent+label')
    fig.show()

In [None]:
# List of skill tag names. It is not referenced by any other cell shown here.
# NOTE(review): "Product Management" appears twice in this list; confirm
# whether the duplicate is intentional before relying on len(skill_tags).
skill_tags = [
    "Software Engineering",
    "Business Development",
    "Product Management",
    "Research",
    "Communications",
    "Data Science",
    "Operations",
    "Growth",
    "Analytics",
    "Marketing",
    "Product Management",
    "Recruiting",
    "Executives",
    "Sales",
    "Design",
    "Customer Service",
    "Finance",
    "Hardware Engineering",
]

### Skills vs. Looking for

In [None]:
# Collect every skill listed by every user.
# Iterating the Series directly yields its values; Series.iteritems()
# no longer exists in pandas >= 2.0.
all_skills = []
for val in clean_df["Skills"]:
    # Skip missing cells (non-strings) and the literal "NaN" placeholder.
    if isinstance(val, str) and val != "NaN":
        all_skills.extend(literal_eval(val))

# Raw occurrence count per skill.
skills_freq = nltk.FreqDist(all_skills)
# Skills ordered from most to least frequent. This sorts the skill counts
# themselves, not the `freq` variable left over from the per-category loop.
sorted_skills_freq = dict(sorted(skills_freq.items(), key=lambda item: item[1])[::-1])
# Each skill's share of all listed skills, in percent.
total_skills = len(all_skills)
skill_freq = {skill: 100 * (count / total_skills) for skill, count in skills_freq.items()}
skill_freq

In [None]:
# Collect every "Looking for" term listed by every user.
# Iterating the Series directly yields its values; Series.iteritems()
# no longer exists in pandas >= 2.0.
all_looking = []
for val in clean_df["Looking for"]:
    # Skip missing cells (non-strings) and the literal "NaN" placeholder.
    if isinstance(val, str) and val != "NaN":
        all_looking.extend(literal_eval(val))

# Raw occurrence count per term.
looking_counts = nltk.FreqDist(all_looking)
# Terms ordered from most to least frequent, computed from this cell's own
# counts rather than the `freq` variable left over from the per-category loop.
sorted_looking_freq = dict(sorted(looking_counts.items(), key=lambda item: item[1])[::-1])
# Each term's share of all "Looking for" terms, in percent.
total_looking = len(all_looking)
looking_freq = {term: 100 * (count / total_looking) for term, count in looking_counts.items()}
looking_freq

In [None]:
# Report terms that occur in only one of the two frequency tables.

# Keys of looking_freq that are absent from skills_freq.
for missing in (name for name in looking_freq if name not in skills_freq):
    print(f"{missing} not in skills")

# Keys of skills_freq that are absent from looking_freq.
for missing in (name for name in skills_freq if name not in looking_freq):
    print(f"{missing} not in looking for")

In [None]:
# Build a long-format frame with two rows per skill term: its percentage
# among "Skills" answers and its percentage among "Looking for" answers.
labels = list(skills_freq.keys())

rows = []
for label in labels:
    # skill_freq holds percentages; skills_freq holds raw counts and would
    # not be comparable with the percentages stored in looking_freq.
    rows.append({"Term": label, "%": skill_freq[label], "Category": "Skill"})
    # A skill that never appears under "Looking for" counts as 0 %.
    rows.append({"Term": label, "%": looking_freq.get(label, 0.0), "Category": "Looking for"})

# Build the frame in one step; DataFrame.append no longer exists in pandas >= 2.0.
df = pd.DataFrame(rows, columns=["Term", "%", "Category"])
df

In [None]:
# Grouped bar chart of df: one group of bars per "Term", coloured by "Category",
# with bar height taken from the "%" column.
fig = plt.figure(figsize=(15, 6))
sns.barplot(x="Term", y="%", hue="Category", data=df)

# Axis titles: the x axis is left untitled because the tick labels are the terms.
plt.ylabel("% of Responses", fontsize=14)
plt.xlabel("")

# Tick styling: rotate the term labels on the x axis.
plt.yticks(fontsize=12)
plt.xticks(fontsize=12, rotation=45)

plt.show()

# Breakdown of users

In [None]:
# Per-user frame without the free-text "Bio" column and the term-list columns.
# Every remaining value is cast to str, so missing locations become "nan".
# drop() already returns a new frame, so clean_df is left untouched.
loc_df = clean_df.drop(columns=["Bio"] + cols_to_plot).astype(str)

# Split each location once on commas. A location with at least one comma
# yields City = first piece and Country = second piece; anything else
# (including "nan") is bucketed as "Other" for both columns.
location_parts = [location.split(",") for location in loc_df["Location"]]
loc_df["City"] = [parts[0].strip() if len(parts) > 1 else "Other" for parts in location_parts]
loc_df["Country"] = [parts[1].strip() if len(parts) > 1 else "Other" for parts in location_parts]

In [None]:
# Pie chart of how many rows of loc_df fall in each "Country".
# count() tallies non-null cells per column; "Name" is used as the slice size.
country_counts = loc_df.groupby("Country", as_index=False).count()
fig = px.pie(
    country_counts,
    values="Name",
    names="Country",
    # The headline number is the number of rows in loc_df, not the length of
    # the unrelated term-comparison frame `df`.
    title=f"{len(loc_df)} Fellows",
)
fig.show()