In [1]:
from __future__ import division
from __future__ import print_function
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, roc_curve, auc
from sklearn import cross_validation
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from pandas.tools.plotting import scatter_matrix
import matplotlib.pyplot as plt
%matplotlib inline

import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import *
from nltk import word_tokenize
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

import re
from HTMLParser import HTMLParser
import datetime
import cPickle as pickle
pd.options.mode.chained_assignment = None  # default='warn'



In [2]:
# Files
# All four inputs live in the same directory; keep that directory in one
# place so the notebook can be pointed at another copy of the data by
# editing a single line.
DATA_DIR = '/Users/lekha/galvanize/capstone/projectRiley/data/withindgroup'
all_file = DATA_DIR + '/all_withindgroup.txt'
tech_file = DATA_DIR + '/tech_withind.txt'
hadoop_all = DATA_DIR + '/all_hadoop.txt'
hadoop_tech = DATA_DIR + '/tech_hadoop.txt'

In [None]:
# Load each pipe-delimited input file exactly once.
dfall1 = pd.read_csv(all_file, sep="|")
dftech1 = pd.read_csv(tech_file, sep="|")
dfh = pd.read_csv(hadoop_all, sep="|")
dfht = pd.read_csv(hadoop_tech, sep="|")

# Stack the Hadoop frames under the base frames (row-wise). The frames
# loaded above are reused instead of parsing the same files a second time;
# pd.concat returns new objects, so dfh / dfht are left untouched.
dftech_all = pd.concat([dftech1, dfht], axis=0)
df_all2 = pd.concat([dfall1, dfh], axis=0)

In [None]:
# NOTE(review): f_vocab is not assigned in any cell visible in this notebook,
# so on a fresh kernel this line raises NameError. Confirm where f_vocab is
# meant to come from, or remove this cell.
f_vocab = str(f_vocab)


In [3]:
# Rebinds all_file / tech_file (first assigned in the "# Files" cell) to the
# all2 / tech_all2 inputs.
# NOTE(review): because the names are reused, re-running the earlier
# read_csv cell after this one would load these files instead of the
# originals -- consider distinct variable names.
all_file = '/Users/lekha/galvanize/capstone/projectRiley/data/withindgroup/all2.txt'
tech_file = '/Users/lekha/galvanize/capstone/projectRiley/data/withindgroup/tech_all2.txt'

# Pipe-delimited loads. df is an independent copy of df1, so later column
# additions on df do not modify df1.
df1 = pd.read_csv(all_file, sep="|")
df2 = pd.read_csv(tech_file, sep="|")
df = df1.copy()

In [4]:
# Functions
def missing(df):
    """Return 1 when a record has no usable summary, otherwise 0.

    `df` is a single record exposing `.summary` and `.num_tokens`
    (preprocess_df applies this function row-wise with axis=1). A record
    counts as missing when its summary is the literal string 'missing'
    or its token count is zero.
    """
    is_missing = df.summary == 'missing' or df.num_tokens == 0
    return 1 if is_missing else 0

    
def lenx(mystr):
    """Return the number of whitespace-separated words in `mystr`."""
    words = mystr.split()
    return len(words)


def avgchrs(mytokens):
    """Return the mean number of characters per token.

    mytokens: sequence of string tokens.
    Returns a float; 0.0 for an empty sequence (previously this raised
    ZeroDivisionError).
    """
    tw = len(mytokens)
    if tw == 0:
        # No tokens: define the average as 0.0 rather than dividing by zero.
        return 0.0
    num_chars = sum(len(word) for word in mytokens)
    # float() keeps this a true division even without the file-level
    # `from __future__ import division`.
    return num_chars / float(tw)


def remove_digits(mystr):
    '''
    INPUT: list of string tokens (despite the parameter name)
    OUTPUT: new list holding the tokens that are not made up entirely of
            digits, in their original order
    '''
    kept = []
    for token in mystr:
        if token.isdigit():
            continue
        kept.append(token)
    return kept


# Module-level English Snowball stemmer; tokenize() and tokenize_no_stem()
# pass it to stem_tokens().
stemmer = SnowballStemmer("english")
def stem_tokens(tokens, stemmer):
    """Return a new list with `stemmer.stem` applied to each token, in order.

    tokens:  iterable of string tokens
    stemmer: any object exposing a `stem(word)` method
    """
    return [stemmer.stem(token) for token in tokens]


def tokenize(text):
    """Tokenize `text` and return its stemmed tokens.

    Pipeline: word_tokenize -> remove_digits (drops digit-only tokens) ->
    stem_tokens using the module-level `stemmer`.
    """
    without_digits = remove_digits(word_tokenize(text))
    return stem_tokens(without_digits, stemmer)


def tokenize_no_stem(text):
    """Tokenize `text` with word_tokenize and return the tokens unstemmed.

    The previous version also digit-filtered and stemmed every token and
    then threw both results away; that dead work is removed and the
    return value is unchanged.

    NOTE(review): digit-only tokens are NOT removed here, unlike in
    tokenize(). The discarded digit-filtered list suggests filtering may
    have been intended -- confirm before changing, since this function is
    used as the vectorizer tokenizer by the topic-model helpers.
    """
    return word_tokenize(text)

def preprocess_df(df):
    """Engineer summary-based features and drop rows without a usable summary.

    Reads the 'gender_forced' and 'summary' columns of `df`.

    Side effect (unchanged from before): the columns 'class',
    'summ_tokens', 'num_tokens' and 'summ_missing' are added to the frame
    that was passed in.

    Returns a new, independent DataFrame restricted to rows whose summary
    is present, with 'avg_len' and 'lex_diversity' added. Also prints the
    number of rows kept.
    """
    # Feature Engineering before running the prediction code
    # Binary target: 0 for 'female', 1 for every other gender_forced value.
    df['class'] = df['gender_forced'].apply(lambda x: 0 if x == 'female' else 1)

    df['summ_tokens'] = df['summary'].apply(lambda x: nltk.word_tokenize(str(x)))
    df['num_tokens'] = df['summ_tokens'].apply(len)

    # Add feature for missing summary
    df['summ_missing'] = df.apply(missing, axis = 1)

    # Only include rows with summaries. NaN summaries need the explicit
    # notnull test: presumably str(NaN) tokenizes to a non-empty list, so
    # missing() does not flag them.
    has_summary = (df['summ_missing'] == 0) & pd.notnull(df['summary'])
    # .copy() so the assignments below write to an independent frame rather
    # than to a filtered slice of the caller's frame (the pattern the
    # chained-assignment warning exists for).
    df = df[has_summary].copy()

    print ("Length of DF after removing rows with missing Summaries:\n")
    print (len(df))

    df['avg_len'] = df['summ_tokens'].apply(avgchrs)

    # lexical diversity = number of unique tokens / total number of tokens
    # (every remaining row has at least one token, so len(x) is non-zero)
    df['lex_diversity'] = df['summ_tokens'].apply(lambda x: len(set(x))/len(x))

    return df

In [None]:
# Print the top 20 words and weights for each topic
def print_top_weights(model, feature_names, n_top_words):
    """Print the highest-weighted words, with their weights, for each topic.

    model:         fitted decomposition exposing `components_`, one row of
                   word weights per topic (e.g. the NMF models built in
                   this notebook)
    feature_names: sequence mapping a column index of `components_` to
                   its word
    n_top_words:   how many words to print per topic
    """
    # Take the topic-word matrix from the model that was passed in. The
    # previous version looped over a global `H` that is never defined at
    # module level, so the `model` argument was ignored.
    H = model.components_
    for topic_idx, topic in enumerate(H):
        print("Topic #%d:" % topic_idx)
        # Reverse slice of the ascending argsort: indices of the
        # n_top_words largest weights, largest first.
        top_weights = topic.argsort()[:-n_top_words - 1:-1]
        print (" ".join(["{0}, {1}".format(feature_names[x], topic[x]) for x in top_weights]))
    print()
    

# Print the most probable topic for each document/profile
def profiles_by_topic(W):
    """Return the index of the highest-weighted topic for every row of W.

    W: 2-D array with one row per document/profile and one column per
       topic.
    Prints a two-column table of (topic index, number of rows assigned to
    it) and returns an integer array of shape (n_rows, 1).
    """
    n_profiles = W.shape[0]
    dominant = np.zeros([n_profiles, 1], dtype=int)
    for i in range(n_profiles):
        # Last entry of the ascending argsort = column of the row's
        # largest weight.
        dominant[i] = W[i].argsort()[-1]
    topics, counts = np.unique(dominant, return_counts=True)
    summary_table = np.asarray((topics, counts)).T
    print (summary_table)
    return dominant

           
def run_topic_model_tfidf(X, stopwords, n_topics=7, n_top_words=20):
    """Fit an NMF topic model on TF-IDF features of X and print a summary.

    X:           iterable of raw text documents (handed to
                 TfidfVectorizer.fit_transform)
    stopwords:   stop-word collection passed to the vectorizer
    n_topics:    number of NMF components (default 7, the value that was
                 previously hard-coded)
    n_top_words: number of words printed per topic (default 20, as
                 before)

    Prints the shapes of H and W, the top words of each topic, and the
    number of documents per dominant topic.
    Returns (nmf_model, W, top_idx) where W is the document-topic matrix
    and top_idx is the per-document dominant topic from
    profiles_by_topic.
    """
    print ("Bag of Words, Tfidf\n")

    # Unstemmed tokens; vocabulary capped at the 5000 most frequent terms.
    vectorizer = TfidfVectorizer(analyzer = 'word', tokenizer = tokenize_no_stem, stop_words = stopwords, max_features = 5000)

    # Convert the sparse result to a dense array, as before.
    word_frequencies = vectorizer.fit_transform(X).toarray()
    feature_names = vectorizer.get_feature_names()

    # NMF Model to determine topics (fixed random_state for repeatable runs)
    nmf_model = NMF(n_components=n_topics, init='random', random_state=0)
    W = nmf_model.fit_transform(word_frequencies)
    # H: Topics * Words
    H = nmf_model.components_
    print (H.shape, W.shape)

    # Print the n_top_words highest-weighted words for each topic
    for topic_idx, topic in enumerate(H):
        print("Topic #%d:" % topic_idx)
        top_weights = topic.argsort()[:-n_top_words - 1:-1]
        print (" ".join(["{0}".format(feature_names[x]) for x in top_weights]))
    print()

    # Highest Weighted Topic for each profile
    print ("Number of profiles by topic\n")
    top_idx = profiles_by_topic(W)

    return nmf_model, W, top_idx


# tokenize_no_stem: tokenizer used below; it does NOT stem
# stopwords: custom stop words
def run_topic_model_countv(X, stopwords, n_topics=4, n_top_words=20):
    """Fit an NMF topic model on raw term counts of X and print a summary.

    X:           iterable of raw text documents (handed to
                 CountVectorizer.fit_transform)
    stopwords:   stop-word collection passed to the vectorizer
    n_topics:    number of NMF components (default 4, the value that was
                 previously hard-coded)
    n_top_words: number of words printed per topic (default 20, as
                 before)

    Prints the shapes of H and W, the top words of each topic with their
    weights, and the number of documents per dominant topic.
    Returns (nmf_model, W, top_idx) where W is the document-topic matrix
    and top_idx is the per-document dominant topic from
    profiles_by_topic.
    """
    print ("Bag of words, Count Vectorizer...\n")
    # max_df / min_df drop terms found in more than 70% of the documents
    # or in fewer than 5 of them; vocabulary capped at 5000 terms.
    vectorizer = CountVectorizer(analyzer = "word",
                                 tokenizer = tokenize_no_stem,
                                 preprocessor = None,
                                 stop_words = stopwords,
                                 max_df = 0.7,
                                 min_df = 5,
                                 max_features = 5000)

    # fit_transform() learns the vocabulary and then turns the documents
    # into count vectors; the sparse result is converted to a dense array,
    # as before.
    word_frequencies = vectorizer.fit_transform(X).toarray()
    feature_names = vectorizer.get_feature_names()

    # NMF Model to determine topics (fixed random_state for repeatable runs)
    nmf_model = NMF(n_components=n_topics, init='random', random_state=0)
    W = nmf_model.fit_transform(word_frequencies)
    # H: Topics * Words
    H = nmf_model.components_
    print (H.shape, W.shape)

    # Print the n_top_words highest-weighted words (with weights) per topic
    for topic_idx, topic in enumerate(H):
        print("Topic #%d:" % topic_idx)
        top_weights = topic.argsort()[:-n_top_words - 1:-1]
        print (" ".join(["{0}, {1}".format(feature_names[x], topic[x]) for x in top_weights]))
    print()

    print ("Number of profiles by Topic\n")
    top_idx = profiles_by_topic(W)

    return nmf_model, W, top_idx

In [5]:
def topic_models_by_df(df):
    """Print TF-IDF and count-vectorizer topic models for all, female and male rows.

    df: raw frame; it is passed through preprocess_df() here, so callers
        do not need to preprocess it first.
    Returns None; all results are printed by the run_topic_model_*
    helpers.
    """
    df = preprocess_df(df)

    # NOTE(review): the subsets are selected on the 'gender' column, while
    # the class label built in preprocess_df comes from 'gender_forced' --
    # confirm that this split is the intended one.
    subsets = [
        ("ALL", df),
        ("FEMALE", df[df['gender'] == 'female']),
        ("MALE", df[df['gender'] == 'male']),
    ]

    # Same two models for every subset; the unused class arrays and unused
    # return-value bindings of the previous copy-pasted version are gone.
    for label, subset in subsets:
        X = subset['summary']
        print ("Topics for {0} profiles: TFIDF\n".format(label))
        run_topic_model_tfidf(X, stopwords)
        print ("Topics for {0} profiles: Count Vectorizer\n".format(label))
        run_topic_model_countv(X, stopwords)

In [7]:
# Preprocess the working frame, then print the topic models for it.
# NOTE(review): topic_models_by_df() calls preprocess_df() on its argument
# again, so preprocessing runs twice here (the length message in the
# recorded output appears twice).
# NOTE(review): the recorded NameError for run_topic_model_tfidf suggests
# the cell defining it had not been run in that session (it is marked
# "In [None]").
df= preprocess_df(df)
topic_models_by_df(df)

Length of DF after removing rows with missing Summaries:

26850
Length of DF after removing rows with missing Summaries:

26850
Topics for ALL profiles: TFIDF



NameError: global name 'run_topic_model_tfidf' is not defined