# Import Libraries

In [2]:
# General system libraries
import os
import sys
from IPython.display import Image, Markdown
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Dataframe libraries
import pandas as pd
from pandas import DataFrame, read_csv

# Number manipulation
import scipy.sparse
from scipy.ndimage.filters import generic_filter
import patsy
import numpy as np

# Plotting libaries
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
%matplotlib inline

# Data type libaries
from datetime import datetime as dt

# File manipulation
import pickle
import pandas.io.sql as pd_sql
from sqlalchemy import create_engine
import psycopg2 as pg
from flatten_json import flatten

# NLP libraries
import wikipedia as wiki
from nltk import word_tokenize, sent_tokenize,FreqDist, pos_tag
from nltk.corpus import stopwords
import gensim as gn
from gensim import corpora, models, similarities
from collections import defaultdict
from six import iteritems
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from spacy.lang.en.stop_words import STOP_WORDS
import string
import emoji
import enchant
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer


# Scraping libraries
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from scraping_functions.tumblr_api import get_client
import pytumblr

# Stats libaries
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import datasets, linear_model, metrics
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import svm, datasets
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA, NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.neighbors import KNeighborsClassifier



# Other libaries
import geopy

  """)
2018-06-03 20:01:04,484 : INFO : 'pattern' package not found; tag filters are not available for English


# Import all corpuses df

In [5]:
all_corpuses_df = pd.read_pickle('iteration1_files/all_corpuses_df.pkl')

In [6]:
all_corpuses_df.head(5)

Unnamed: 0,username,text,cleaned_text,stemmed_text,lemmatized_text
19,spaceeblack,Space Boy,space boy,space boy,space boy
22,vongriffis,seen it all before // bring me the horizon,seen it all before bring me the horizon,seen it all befor bring me the horizon,see it all before bring me the horizon
26,vongriffis,$UICIDEBOY$ // Kill Yourself (Part III),uicideboy kill yourself part iii,uicideboy kill yourself part iii,uicideboy kill yourself part iii
30,vongriffis,"I feel like I’m a no one, that’s what they tol...",i feel like im a no one thats what they told me,i feel like im a no one that what they told me,i feel like im a no one thats what they tell me
31,vongriffis,Issues - The Worst Of Them,issues the worst of them,issu the worst of them,issue the worst of them


In [96]:
all_corpuses_df.shape

(8771, 5)

# Define a function to Vectorize the text both ways

In [68]:
def vectorize_both_ways(cleaned_df, text_to_vectorize):
    stop_words = list(STOP_WORDS)
    stop_words.append('a')
    cv = CountVectorizer(stop_words=stop_words)
    tfidf = TfidfVectorizer(stop_words=stop_words)
    corpus = cleaned_df[text_to_vectorize]
    cv_fitted = cv.fit(corpus)
    tfidf_fitted = tfidf.fit(corpus)
    cv_data = cv.fit_transform(corpus)
    tfidf_data = tfidf.fit_transform(corpus)
    return cv_fitted, cv_data, tfidf_fitted, tfidf_data

## Test the function

In [69]:
cv_fitted_test, cv_data_test, tfidf_fitted_test, tfidf_data_test = vectorize_both_ways(all_corpuses_df[:100], 'cleaned_text')

In [70]:
cv_fitted_test

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['yours', 'wherein', 'say', 'had', 'someone', 'elsewhere', 'do', 'go', 'none', 'besides', 'yourselves', 'below', 'therein', 'everything', 'always', 'mine', 'well', 'about', 'beside', 'while', 'noone', 're', 'show', 'her', 'nine', 'does', 'never', 'ca', 'i', 'off', 'if', 'ever', 'whatever'...', 'who', 'an', 'amongst', 'anyhow', 'becomes', 'one', 'but', 'become', 'nevertheless', 'down', 'a'],
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [71]:
cv_data_test

<100x438 sparse matrix of type '<class 'numpy.int64'>'
	with 552 stored elements in Compressed Sparse Row format>

In [72]:
tfidf_fitted_test

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=['yours', 'wherein', 'say', 'had', 'someone', 'elsewhere', 'do', 'go', 'none', 'besides', 'yourselves', 'below', 'therein', 'everything', 'always', 'mine', 'well', 'about', 'beside', 'while', 'noone', 're', 'show', 'her', 'nine', 'does', 'never', 'ca', 'i', 'off', 'if', 'ever', 'whatever'...', 'who', 'an', 'amongst', 'anyhow', 'becomes', 'one', 'but', 'become', 'nevertheless', 'down', 'a'],
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [73]:
tfidf_data_test

<100x438 sparse matrix of type '<class 'numpy.float64'>'
	with 552 stored elements in Compressed Sparse Row format>

# Apply the function

In [74]:
cv_fitted, cv_data, tfidf_fitted, tfidf_data = vectorize_both_ways(all_corpuses_df, 'cleaned_text')

In [75]:
pickle.dump(cv_fitted, open('iteration1_files/cv_fitted', 'wb'))

In [76]:
pickle.dump(cv_data, open('iteration1_files/cv_data', 'wb'))

In [77]:
pickle.dump(tfidf_fitted, open('iteration1_files/tfidf_fitted', 'wb'))

In [78]:
pickle.dump(tfidf_data, open('iteration1_files/tfidf_data', 'wb'))

# Define a function to generate fitted vectorization/model combos and data

In [79]:
def gen_vectorizer_model_combos(cv_fitted, cv_data, tfidf_fitted, tfidf_data, n_topics=5, random_state=30):
    n_topics=n_topics
    random_state=random_state
    
    
    nmf_cv = NMF(n_components=n_topics, random_state=random_state)
    nmf_cv_data = nmf_cv.fit_transform(cv_data)
    
    nmf_tfidf = NMF(n_components=n_topics, random_state=random_state)
    nmf_tfidf_data = nmf_tfidf.fit_transform(tfidf_data)
    
    lsa_cv = TruncatedSVD(n_components=n_topics, random_state=random_state)
    lsa_cv_data = lsa_cv.fit_transform(cv_data)
    
    lsa_tfidf = TruncatedSVD(n_components=n_topics, random_state=random_state)
    lsa_tfidf_data = lsa_tfidf.fit_transform(tfidf_data)
    
    lda_cv = LatentDirichletAllocation(n_components=n_topics, random_state=random_state)
    lda_cv_data = lda_cv.fit_transform(cv_data)
    
    lda_tfidf = LatentDirichletAllocation(n_components=n_topics, random_state=random_state)
    lda_tfidf_data = lda_tfidf.fit_transform(tfidf_data)
    
    combo_models_list = [nmf_cv, nmf_tfidf, lsa_cv, lsa_tfidf, lda_cv, lda_tfidf]
    
    return nmf_cv, nmf_cv_data, nmf_tfidf, nmf_tfidf_data, lsa_cv, lsa_cv_data, lsa_tfidf, lsa_tfidf_data, lda_cv, lda_cv_data, lda_tfidf, lda_tfidf_data, combo_models_list
    

In [80]:
nmf_cv, nmf_cv_data, nmf_tfidf, nmf_tfidf_data, lsa_cv, lsa_cv_data, lsa_tfidf, lsa_tfidf_data, lda_cv, lda_cv_data, lda_tfidf, lda_tfidf_data, combo_models_list = gen_vectorizer_model_combos(cv_fitted, cv_data, tfidf_fitted, tfidf_data)



In [81]:
pickle.dump(nmf_cv, open('iteration1_files/nmf_cv.pkl', 'wb'))

In [82]:
pickle.dump(nmf_cv_data, open('iteration1_files/nmf_cv_data.pkl', 'wb'))

In [83]:
pickle.dump(nmf_tfidf, open('iteration1_files/nmf_tfidf.pkl', 'wb'))

In [84]:
pickle.dump(nmf_tfidf_data, open('iteration1_files/nmf_tfidf_data.pkl', 'wb'))

In [85]:
pickle.dump(lsa_cv, open('iteration1_files/lsa_cv.pkl', 'wb'))

In [86]:
pickle.dump(lsa_cv_data, open('iteration1_files/lsa_cv_data.pkl', 'wb'))

In [87]:
pickle.dump(lsa_tfidf, open('iteration1_files/lsa_tfidf.pkl', 'wb'))

In [88]:
pickle.dump(lsa_tfidf_data, open('iteration1_files/lsa_tfidf_data.pkl', 'wb'))

In [89]:
pickle.dump(lda_cv, open('iteration1_files/lda_cv.pkl', 'wb'))

In [90]:
pickle.dump(lda_cv_data, open('iteration1_files/lda_cv_data.pkl', 'wb'))

In [91]:
pickle.dump(lda_tfidf, open('iteration1_files/lda_tfidf.pkl', 'wb'))

In [92]:
pickle.dump(lda_tfidf_data, open('iteration1_files/lda_tfidf_data.pkl', 'wb'))

In [93]:
pickle.dump(combo_models_list, open('iteration1_files/combo_models_list.pkl', 'wb'))

# Define a function to obtain a topics df

In [94]:
def gen_topics_for_one_combo(combo_model, combo_model_name, fitted_vectorizer, num_top_words):
    feature_names = fitted_vectorizer.get_feature_names()
    
    for idx, topic in enumerate(combo_model.components_):
        topic_words = " ".join([feature_names[i]
                        for i in topic.argsort()[:-num_top_words - 1:-1]])
        return "{}_Topic{}: {}".format(combo_model_name, idx+1, topic_words)
        

In [95]:
gen_topics_for_one_combo(lda_tfidf, 'lda_tfidf', tfidf_fitted, 20)

'lda_tfidf_Topic1: weight trying okay httpiglovequotesnet thoughts lets week head kill tips meet wiltedflower relate alienmonster dropneurons yall lose photos psa goal'

In [97]:
gen_topics_for_one_combo(lda_cv, 'lda_cv', cv_fitted, 20)

'lda_cv_Topic1: weight reblog okay stay trying guys lose black world said year try white hard kill head skinny words calories cut'

In [None]:
def compile_topics_df(combo_models_list, cv_fitted, tfidf_fitted, num_top_words):
    for combo_model in combo_models_list:
        gen_topics_for_one_combo(combo_model)
        

In [38]:
def display_topics(model, feature_names, no_top_words):
    for ix, topic in enumerate(model.components_):
        print("Topic ", ix+1)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        

In [100]:
display_topics(lda_tfidf, tfidf_fitted.get_feature_names(), 20)

Topic  1
weight trying okay httpiglovequotesnet thoughts lets week head kill tips meet wiltedflower relate alienmonster dropneurons yall lose photos psa goal
Topic  2
im happy look things follow month good pride blog need today wish sorry dark like love cute thats redheart pretty
Topic  3
cant im dont want love hate life like feel friends people lost way help oh right reblog sad tell ive
Topic  4
weheartit love hey thinspo mood httpswwwinstagramcomthepersonalquotes care black night ig pics white great time kiss sleep fall home use money
Topic  5
want like know dont people time youre im love die fuck day new beautiful edit hard need art stop post


In [41]:
lsa_tfidf_data.get_feature_names()

AttributeError: 'numpy.ndarray' object has no attribute 'get_feature_names'

In [None]:
lsa_tfidf.fe

In [9]:
display_topics(lda,count_vectorizer.get_feature_names(),20)

NameError: name 'lda' is not defined

In [None]:
displ

In [None]:
def get_topics_df(nmf_cv, ):
    