In [2]:
import json
import pandas as pd
import numpy as np
import random
from bidict import bidict
import pickle
from collections import Counter
import re
from pprint import pprint

In [3]:
#raw_df = pd.read_pickle("/data/rali7/Tmp/solimanz/data/pickles/clean_aug.pkl")
raw_df = pd.read_pickle("/data/rali7/Tmp/solimanz/data/pickles/2017_11_28.pkl")

In [4]:
def fmtcols(mylist, cols):
    maxwidth = max(map(lambda x: len(x), mylist))
    justifyList = list(map(lambda x: x.ljust(maxwidth), mylist))
    lines = (' '.join(justifyList[i:i+cols]) for i in range(0,len(justifyList),cols))
    return "\n".join(lines)

In [5]:
def apply_transforms(s):
    for transform in transforms:
        s = re.sub(*transform, s)
    return s

In [6]:
def isEnglish(s):
    try:
        s.encode('ascii')
    except UnicodeEncodeError:
        return False
    else:
        return True

In [7]:
transforms = [
    # senior/junior
    (re.compile(r'\bsr\.(?=[a-z0-9])'), 'senior '),
    (re.compile(r'\bjr\.(?=[a-z0-9])'), 'junior '),
    (re.compile(r'\bsr\.(?!\S)'), 'senior'),
    (re.compile(r'\bjr\.(?!\S)'), 'junior'),
    (re.compile(r'\bsenior\.(?!\S)'), 'senior'),
    (re.compile(r'\bjunior\.(?!\S)'), 'junior'),
    (re.compile(r'\bsenior\.(?=[a-z0-9])'), 'senior '),
    (re.compile(r'\bjunior\.(?=[a-z0-9])'), 'junior '),
    (re.compile(r'\bsr\b'), 'senior'),
    (re.compile(r'\bjr\b'), 'junior'),
    # IT
    (re.compile(r'\bi\.*t\.*(?=[a-z0-9])'), 'information_technology '),
    (re.compile(r'\bi\.*t\.*(?!\S)'), 'information_technology'),
    # C*O
    (re.compile(r'\bc\.*e\.*o\.*(?!\S)'), 'chief_executive_officer'),
    (re.compile(r'\bc\.*o\.*o\.*(?!\S)'), 'chief_operating_officer'),
    (re.compile(r'\bc\.*t\.*o\.*(?!\S)'), 'chief_technology_officer'),
    (re.compile(r'\bc\.*f\.*o\.*(?!\S)'), 'chief_finance_officer'),
    (re.compile(r'\bchief financial officer\b'), 'chief_finance_officer'),
    (re.compile(r'\bchief operations officer\b'), 'chief_operating_officer'),
    
    (re.compile(r'\bceo/chief executive officer\b'), 'chief_executive_officer'),
    (re.compile(r'\bcoo/chief operating officer\b'), 'chief_operating_officer'),
    (re.compile(r'\bcto/chief technology officer\b'), 'chief_technology_officer'),
    (re.compile(r'\bcfo/chief finance officer\b'), 'chief_finance_officer'),
    
    (re.compile(r'\bchief executive officer\b'), 'chief_executive_officer'),
    (re.compile(r'\bchief operating officer\b'), 'chief_operating_officer'),
    (re.compile(r'\bchief technology officer\b'), 'chief_technology_officer'),
    (re.compile(r'\bchief finance officer\b'), 'chief_finance_officer'),
    # VP
    (re.compile(r'\bv\.*p\.(?=[a-z0-9])'), 'vice_president '),
    (re.compile(r'\bv\.*p\.*(?!\S)'), 'vice_president'),
    (re.compile(r'\bvice-president\b'), 'vice_president'),
    (re.compile(r'\bvice president\b'), 'vice_president'),
    #technician vs tech
    (re.compile(r'\btech\.(?=[a-z0-9])'), 'technician '),
    (re.compile(r'\btech\.*(?!\S)'), 'technician'),
    # cofounder
    (re.compile(r'\bco(-|\s)founder\b'), 'co_founder'),
    # co-...
    (re.compile(r'\bco(-|\s)'), 'co_'),
    #coop
    (re.compile(r'\bco[-|\s]op\b'), 'coop'),
    #addon
    (re.compile(r'\badd[-|\s]on\b'), 'addon'),
    # Nurses
    (re.compile(r'\br\.*n\.*(?!\S)'), 'registered nurse'),
    (re.compile(r'\br\.*n\.(?=[a-z0-9])'), 'registered nurse '),
    (re.compile(r'\br\.*p\.*n\.*(?!\S)'), 'registered practical nurse'),
    (re.compile(r'\br\.*p\.*n\.(?=[a-z0-9])'), 'registered practical nurse '),
    #T.A.
    (re.compile(r"\bt\.*a\.*(?!\S)"), "teaching assistant"),
    (re.compile(r"\bt\.*a\.(?=[a-z0-9])"), "teaching assistant "),
    (re.compile(r"\bteacher assistant\b"), "teaching assistant"),
    (re.compile(r"\bteacher's assistant\b"), "teaching assistant"),
    (re.compile(r"\bteacher's assitant\b"), "teaching assistant"),
    (re.compile(r"\bteacher's assistant\b"), "teaching assistant"),
    (re.compile(r"\bteacher's assitant\b"), "teaching assistant"),
    # HR
    (re.compile(r'\bh\.*r\.(?=[a-z0-9])'), 'human_resources '),
    (re.compile(r'\bh\.*r\.*(?!\S)'), 'human_resources'),
    # Customer service reps
    (re.compile(r'\bc\.*s\.*r\.(?=[a-z0-9])'), 'customer_service representative '),
    (re.compile(r'\bc\.*s\.*r\.*(?!\S)'), 'customer_service representative'),
    (re.compile(r'\bcustomer service rep\.*(?!\S)'), 'customer_service representative'),
    # qa / qc
    (re.compile(r'\bq\.*a\.(?=[a-z0-9])'), 'quality assurance '),
    (re.compile(r'\bq\.*a\.*(?!\S)'), 'quality assurance'),
    (re.compile(r'\bq\.*c\.(?=[a-z0-9])'), 'quality control '),
    (re.compile(r'\bq\.*c\.*(?!\S)'), 'quality control'),
    # database administrator
    (re.compile(r'\bdba\b'), 'database administrator'),
    (re.compile(r'\bdatabase admin\b'), 'database administrator'),
    (re.compile(r'\bdb admin\b'), 'database administrator'),
    # Instrumentation and electrical
    (re.compile(r'\bi/e\b'), 'instrumentation and electrical'),
    # financial service representative
    (re.compile(r'\bfsr\b'), 'financial service representative'),
    (re.compile(r'\bfreelance\b'), 'freelancer'),
    (re.compile(r'\bdesiginer\b'), 'designer'),
    (re.compile(r'\bbiomed\b'), 'biomedical'),
    (re.compile(r'\bgoverenment\b'), 'government'),
    (re.compile(r'\bmachanic\b'), 'mechanic'),
    (re.compile(r'\bbusiness owner\b'), 'owner'),
    (re.compile(r'\br(\s)*&(\s)*d\b'), 'research_development'),
    (re.compile(r'\br and d\b'), 'research_development'),
    (re.compile(r'research/development'), 'research_development'),
    (re.compile(r'(\s)*&(\s)*'), '/'),
    (re.compile(r'(\s)*,(\s)*'), '/'),
    (re.compile(r'(\s)*/(\s)*'), '/'),
    (re.compile(r'(\s)*\|(\s)*'), '/'),
    (re.compile(r'(\s)+and(\s)+'), '/'),
    (re.compile(r'\beditor-in-chief\b'), 'editor_in_chief'),    
    # Remove all parens and there content
    (re.compile(r'\([\w\s\W\S]*\)'), ''),    
    #ESL
    (re.compile(r'^esl$'), 'english_as_a_second_language instructor'),
    (re.compile(r'^e\.*s\.*l\.* instructor$'), 'english_as_a_second_language instructor'),
    (re.compile(r'^e\.*s\.*l\.* teacher$'), 'english_as_a_second_language instructor'),
    (re.compile(r'^english as a second language$'), 'english_as_a_second_language instructor'),
    (re.compile(r'^english as a second language teacher$'), 'english_as_a_second_language instructor'),
    (re.compile(r'^e.s.l. instructor$'), 'english_as_a_second_language instructor'),
    (re.compile(r'^e.s.l. teacher$'), 'english_as_a_second_language instructor'),
    (re.compile(r"\benglish as a second language\b"), "english_as_a_second_language"),
    (re.compile(r"\binternship\b"), "intern"),
    (re.compile(r"\bsummer intern\b"), "intern")
    
    ]

In [8]:
raw_df['function'] = raw_df['function'].str.lower()

In [9]:
trans = {r: rep for r, rep in transforms}

In [10]:
raw_df['transformed'] = raw_df['function']

In [11]:
raw_df['transformed'].replace(trans, regex=True, inplace=True)

In [12]:
raw_df['transformed'].replace({"  ": " "}, regex=True, inplace=True)

In [13]:
raw_df['transformed'] = raw_df['transformed'].str.strip()

In [14]:
raw_df['transformed'].replace({re.compile(r"\bsummer intern\b"): "intern"}, regex=True, inplace=True)

In [17]:
raw_df["_id"] = raw_df["_id"].apply(str)

Remove all user profiles using non ascii characters

In [None]:
raw_df.to_pickle("/data/rali7/Tmp/solimanz/data/pickles/clean_2017_11_28.pkl")

In [117]:
func_counts = raw_df['transformed'].value_counts()

In [49]:
eng = [isEnglish(s) for s in func_counts.index.values]
idx = [i for i, x in enumerate(eng) if x == False]
problematic = [func_counts.index.values[i] for i in idx]

In [17]:
bad_ids = raw_df[raw_df.transformed.isin(problematic)]["_id"].unique()

In [19]:
raw_df = raw_df[~raw_df["_id"].isin(bad_ids)]

In [107]:
matches = [s for s in func_counts.index if 'vice president' in s]

In [114]:
#matches

In [118]:
with open("/data/rali7/Tmp/solimanz/data_viz/top_titles.txt", "w") as f:
    f.write(fmtcols(func_counts.index.values[:550], 4))

In [119]:
top_550 = func_counts[:550]
bad_ids = raw_df[~raw_df.transformed.isin(top_550.index)]["_id"].unique()
all_ids = raw_df["_id"].unique()
dataset_ids = list(set(all_ids) - set(bad_ids))
len(dataset_ids)

120330

In [105]:
func_counts["teaching assistant"]

52208