# Imports

In [None]:
import pandas as pd
import numpy as np
from autocorrect import Speller
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from ast import literal_eval

# Load raw data

In [None]:
# Load the edited CSV into `raw`; `df` is derived from it in the next section.
raw = pd.read_csv("data/edited_data.csv")
# Preview the first rows of the loaded table.
raw.head()

# Adjust columns

In [None]:
# Keep only the columns whose header does not start with "Unnamed".
is_unnamed = raw.columns.str.contains('^Unnamed')
df = raw.loc[:, ~is_unnamed]

In [None]:
# Give the two tag columns the shorter names used by the rest of the notebook.
column_renames = {
    "Experience Tags": "Experience",
    "Can Help With": "Skills",
}
df = df.rename(columns=column_renames)

In [None]:
# Merge the "Bio 1" .. "Bio 8" columns into a single "Bio" column.
bio_columns = [f"Bio {i}" for i in range(1, 9)]
df = df.rename(columns={"Bio": "Bio 1"})
# Concatenate the string fragments of each row in column order, skipping
# missing values and the "NaN" placeholder.
bios = [
    "".join(part for part in parts if isinstance(part, str) and part != "NaN")
    for parts in df[bio_columns].itertuples(index=False, name=None)
]
df = df.drop(columns=bio_columns)
df["Bio"] = bios
df.head()

In [None]:
# Remove duplicate users, keeping the first row seen for each "Name".
# A boolean mask replaces the per-row list lookup and per-row `df.drop`
# (both quadratic) and selects rows by position, so a non-unique index
# cannot cause the first occurrence to be dropped along with its duplicate.
# Note: `duplicated` treats missing names as equal to each other.
duplicate_mask = df["Name"].duplicated(keep="first")
for user in df.loc[duplicate_mask, "Name"]:
    print(f"deleting duplicate of {user}")
df = df.loc[~duplicate_mask].reset_index(drop=True)
df

# Parse keywords

In [None]:
# Known tags for the "Experience" column (also reused for "Interests").
# Each tag appears exactly once; the second "Ecommerce" entry was removed.
experience_tags = [
    "Enterprise",
    "Healthcare",
    "Marketplace",
    "Academia",
    "Analytics",
    "Biotech",
    "Ecommerce",
    "FinTech",
    "Future of Work",
    "Future of Food",
    "Insurance",
    "Science",
    "Blockchain/Crypto",
    "Direct to Consumer",
    "Consumer",
    "AI/ML",
    "B2B",
    "Climate",
    "Community",
    "Government",
    "SaaS",
    "Coaching",
    "Developer Tools",
    "Clean Technology",
    "IOT",
    "Productivity",
    "Real Estate",
    "Social",
    "Mental Health/Wellness",
    "Talent",
    "Hardware",
    "Education",
    "No-Code",
    "Gaming",
    "Transportation & Travel",
    "Venture",
    "Robotics",
    "People Ops",
    "VR/AR",
    "Drones",
    "Security",
    "Legal",
    "Social Impact",
    "Construction",
]

In [None]:
# Known tags for the "Skills" column (also reused for "Looking for").
# Each tag appears exactly once; the second "Product Management" entry was removed.
skill_tags = [
    "Software Engineering",
    "Business Development",
    "Product Management",
    "Research",
    "Communications",
    "Data Science",
    "Operations",
    "Growth",
    "Analytics",
    "Marketing",
    "Recruiting",
    "C-Suite Executives",
    "Sales",
    "Design",
    "Customer Service",
    "Finance",
    "Hardware Engineering",
]

In [None]:
# Known tags for the "Stages" column.
stage_tags = [
    "Open to new ideas",
    "In early stages of exploring a specific idea",
    "Raising funding",
    "Built a basic prototype/MVP",
    "Starting to onboard customers",
    "Seed+",
]

In [None]:
# Known tags for the "Objectives" column.
objective_tags = [
    "Looking for co-founder",
    "Looking to join another team",
    "Looking for first employees",
]

In [None]:
# "Interests" uses the same tag list as "Experience" (same list object, not a copy).
interest_tags = experience_tags

In [None]:
# "Looking for" uses the same tag list as "Skills" (same list object, not a copy).
looking_for_tags = skill_tags

In [None]:
# Known tags per column; keys are the column names that the parsing cell
# looks up via `all_tags[col]`.
all_tags = {
    "Experience": experience_tags,
    "Skills": skill_tags,
    "Stages": stage_tags,
    "Objectives": objective_tags,
    "Interests": interest_tags,
    "Looking for": looking_for_tags
}

In [None]:
# split into tokens/phrases
cols_to_parse = ["Experience", "Skills", "Stages", "Objectives", "Interests", "Looking for"]


def _extract_tags(cell, known_tags):
    """Return the sorted, de-duplicated known tags contained in a raw cell.

    Cells that are not strings, and the "NaN" placeholder, are returned
    unchanged. The result holds plain ``str`` objects so that its ``repr``
    can be read back with ``ast.literal_eval`` later in the notebook
    (NumPy scalar strings print as ``np.str_('...')`` under NumPy 2).
    """
    if not isinstance(cell, str) or cell == "NaN":
        return cell
    remaining = cell
    found = set()
    # Test the longest tags first and blank out each match, so that a tag
    # which is a substring of another ("Social" in "Social Impact",
    # "Consumer" in "Direct to Consumer") is only reported when it occurs
    # on its own.
    for tag in sorted(set(known_tags), key=len, reverse=True):
        if tag in remaining:
            found.add(tag)
            remaining = remaining.replace(tag, "\n")
    return sorted(found)


# Assign whole columns back to `df`: writing to the row yielded by
# `iterrows()` is not guaranteed to modify the DataFrame.
for col in cols_to_parse:
    if col not in df.columns:
        print(f"error in column {col}: not present in the data")
        continue
    known_tags = all_tags[col]
    df[col] = pd.Series(
        [_extract_tags(cell, known_tags) for cell in df[col]],
        index=df.index,
        dtype=object,
    )
df

# Save cleaned data

In [None]:
# replace missing values (NaN) with the literal string "NaN"
df = df.replace(np.nan, "NaN", regex=True)

In [None]:
# Write the cleaned table to disk without the index column.
df.to_csv("data/clean_data.csv",index=False)

# Save all responses

In [None]:
# Convert every cell to its string form; the cells below read the tag
# columns back with literal_eval and skip the "nan"/"NaN" placeholders.
df = df.astype(str)

In [None]:
# Build a long-format table with one row per (user, category, selected tag).
cols_to_include = ["Experience","Skills","Interests","Stages","Objectives","Looking for"]
response_columns = ["Name", "Category", "Response"]
records = []
for idx, row in df.iterrows():
    name = row["Name"]
    for col in cols_to_include:
        cell = row[col]
        # "nan" comes from astype(str) on missing values, "NaN" from the placeholder
        if isinstance(cell, str) and cell in ("nan", "NaN"):
            continue
        try:
            responses = literal_eval(cell)
        except (ValueError, SyntaxError, TypeError):
            # An unparseable cell is reported and skipped instead of
            # aborting the whole export.
            print(f"error in row {idx}, column {col}")
            continue
        if not isinstance(responses, (list, tuple)):
            # A bare literal (number, single string) is not a tag list.
            print(f"error in row {idx}, column {col}")
            continue
        records.extend((name, col, resp) for resp in responses)
all_responses = pd.DataFrame(records, columns=response_columns)
all_responses

In [None]:
# Write the long-format response table to disk without the index column.
all_responses.to_csv("data/response_data.csv",index=False)

# Replace phrases with keywords

In [None]:
# Single-keyword replacement for each long "Stages"/"Objectives" phrase.
phrase_map = {
    "Open to new ideas": "ideas",
    "In early stages of exploring a specific idea": "exploring",
    "Raising funding": "funding",
    "Built a basic prototype/MVP": "prototype",
    "Starting to onboard customers": "customers",
    "Seed+": "seed",
    "Looking for co-founder": "partner",
    "Looking to join another team": "team",
    "Looking for first employees": "employees"
}

cols = ["Stages","Objectives"]


def _phrases_to_keywords(cell):
    """Parse a stringified tag list and map each phrase to its keyword.

    Cells that are not strings, and the "nan"/"NaN" placeholders, are
    returned unchanged. Tags without an entry in ``phrase_map`` are kept.
    """
    if not isinstance(cell, str) or cell in ("nan", "NaN"):
        return cell
    return [phrase_map.get(tag, tag) for tag in literal_eval(cell)]


# Assign whole columns back to `df`: writing to the row yielded by
# `iterrows()` is not guaranteed to modify the DataFrame.
for col in cols:
    if col not in df.columns:
        print(f"error in column {col}: not present in the data")
        continue
    converted = []
    for idx, cell in df[col].items():
        try:
            cell = _phrases_to_keywords(cell)
        except (ValueError, SyntaxError, TypeError):
            # Unparseable cell: report it and keep the original value.
            print(f"error in row {idx}, column {col}")
        converted.append(cell)
    df[col] = pd.Series(converted, index=df.index, dtype=object)
df

# Convert/remove acronyms/jargon

In [None]:
# Expansion for each acronym/jargon term; an empty string drops the term.
# Lookups are done per whitespace-separated term after "/" has been replaced
# by " and ", so "VR/AR" arrives as the separate terms "VR" and "AR"; both
# are listed individually. The combined key is kept for direct lookups.
jargon_map = {
    "AI": "Artificial Intelligence",
    "ML": "Machine Learning",
    "B2B": "Business to Business",
    "SaaS": "Software as a Service",
    "FinTech": "Financial Technology",
    "Biotech": "Biotechnology",
    "Crypto": "Cryptocurrency",
    "Ops": "Operations",
    "VR": "Virtual Reality",
    "AR": "Augmented Reality",
    "VR/AR": "Virtual Reality and Augmented Reality",
    "IOT": "Internet of Things",
    "MVP": "minimum viable product",
    "C-Suite": "",
}

In [None]:
def _expand_jargon(cell):
    """Parse a stringified tag list and expand jargon term by term.

    "/" inside a tag is replaced by " and ", each tag is split on
    whitespace, and every resulting term is replaced by its ``jargon_map``
    entry when one exists. Cells that are not strings, and the "nan"/"NaN"
    placeholders, are returned unchanged.
    """
    if not isinstance(cell, str) or cell in ("nan", "NaN"):
        return cell
    tags = [tag.replace("/", " and ") for tag in literal_eval(cell)]
    return [jargon_map.get(term, term) for tag in tags for term in tag.split()]


# Assign whole columns back to `df`: writing to the row yielded by
# `iterrows()` is not guaranteed to modify the DataFrame.
for col in cols_to_parse:
    if col not in df.columns:
        print(f"error in column {col}: not present in the data")
        continue
    expanded = []
    for idx, cell in df[col].items():
        try:
            cell = _expand_jargon(cell)
        except (ValueError, SyntaxError, TypeError, AttributeError):
            # Unparseable cell: report it and keep the original value.
            print(f"error in row {idx}, column {col}")
        expanded.append(cell)
    df[col] = pd.Series(expanded, index=df.index, dtype=object)
df

# Process data
- tokenize
- remove non-letters
- convert to lowercase
- spell-check

In [None]:
spell = Speller()
# A set gives constant-time membership tests; the NumPy array used before
# was scanned in full for every token.
stop_words = set(stopwords.words('english')) | {"the"}


def _clean_terms(terms):
    """Lowercase, tokenize, filter and spell-check a list of terms.

    Only purely alphabetic tokens that are not stop words are kept, and
    each kept token is passed through the spell checker.
    """
    tokens = wordpunct_tokenize(" ".join(term.lower() for term in terms))
    return [spell(token) for token in tokens if token.isalpha() and token not in stop_words]


# Assign whole columns back to `df`: writing to the row yielded by
# `iterrows()` is not guaranteed to modify the DataFrame.
for col in cols_to_parse:
    if col not in df.columns:
        print(f"error in column {col}: not present in the data")
        continue
    cleaned = []
    for idx, cell in df[col].items():
        # Only cells already converted to lists by the earlier cells are processed.
        if isinstance(cell, list):
            try:
                cell = _clean_terms(cell)
            except Exception as exc:
                # Best effort per cell: report the failure and keep the value.
                print(f"error in row {idx}, column {col}: {exc}")
        cleaned.append(cell)
    df[col] = pd.Series(cleaned, index=df.index, dtype=object)
df

# Save processed data

In [None]:
# Write the processed table to disk without the index column.
df.to_csv("data/processed_data.csv",index=False)