# Import Libraries

In [146]:
# General system libraries
import os
import sys
from IPython.display import Image, Markdown
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Dataframe libraries
import pandas as pd
from pandas import DataFrame, read_csv

# Number manipulation
import scipy.sparse
from scipy.ndimage.filters import generic_filter
import patsy
import numpy as np

# Plotting libaries
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
%matplotlib inline
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
output_notebook()

# Data type libaries
from datetime import datetime as dt

# File manipulation
import pickle
import pandas.io.sql as pd_sql
from sqlalchemy import create_engine
import psycopg2 as pg
from flatten_json import flatten

# NLP libraries
import wikipedia as wiki
from nltk import word_tokenize, sent_tokenize,FreqDist, pos_tag
from nltk.corpus import stopwords
import gensim as gn
from gensim import corpora, models, similarities
from collections import defaultdict
from six import iteritems
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from spacy.lang.en.stop_words import STOP_WORDS
import string
import emoji
import enchant
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer


# Scraping libraries
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from scraping_functions.tumblr_api import get_client
import pytumblr

# Stats libaries
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import datasets, linear_model, metrics
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import svm, datasets
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA, NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.neighbors import KNeighborsClassifier



# Other libaries
import geopy

# Import all corpuses df

In [165]:
all_corpuses_df = pd.read_pickle('iteration1_files/all_corpuses_df.pkl')

In [166]:
all_corpuses_df.head(5)

Unnamed: 0,username,text,cleaned_text,stemmed_text,lemmatized_text
19,spaceeblack,Space Boy,space boy,space boy,space boy
22,vongriffis,seen it all before // bring me the horizon,seen it all before bring me the horizon,seen it all befor bring me the horizon,see it all before bring me the horizon
26,vongriffis,$UICIDEBOY$ // Kill Yourself (Part III),uicideboy kill yourself part iii,uicideboy kill yourself part iii,uicideboy kill yourself part iii
30,vongriffis,"I feel like I’m a no one, that’s what they tol...",i feel like im a no one thats what they told me,i feel like im a no one that what they told me,i feel like im a no one thats what they tell me
31,vongriffis,Issues - The Worst Of Them,issues the worst of them,issu the worst of them,issue the worst of them


In [167]:
all_corpuses_df.shape

(8771, 5)

In [349]:
all_corpuses_df.reset_index(inplace=True)

# Define a function to Vectorize the text both ways

In [168]:
def vectorize_both_ways(cleaned_df, text_to_vectorize):
    stop_words = list(STOP_WORDS)
    stop_words.append('a')
    cv = CountVectorizer(stop_words=stop_words)
    tfidf = TfidfVectorizer(stop_words=stop_words)
    corpus = cleaned_df[text_to_vectorize]
    cv_fitted = cv.fit(corpus)
    tfidf_fitted = tfidf.fit(corpus)
    cv_data = cv.fit_transform(corpus)
    tfidf_data = tfidf.fit_transform(corpus)
    return cv_fitted, cv_data, tfidf_fitted, tfidf_data

## Test the function

In [169]:
cv_fitted_test, cv_data_test, tfidf_fitted_test, tfidf_data_test = vectorize_both_ways(all_corpuses_df[:100], 'cleaned_text')

In [170]:
cv_fitted_test

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['yours', 'wherein', 'say', 'had', 'someone', 'elsewhere', 'do', 'go', 'none', 'besides', 'yourselves', 'below', 'therein', 'everything', 'always', 'mine', 'well', 'about', 'beside', 'while', 'noone', 're', 'show', 'her', 'nine', 'does', 'never', 'ca', 'i', 'off', 'if', 'ever', 'whatever'...', 'who', 'an', 'amongst', 'anyhow', 'becomes', 'one', 'but', 'become', 'nevertheless', 'down', 'a'],
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [171]:
cv_data_test

<100x438 sparse matrix of type '<class 'numpy.int64'>'
	with 552 stored elements in Compressed Sparse Row format>

In [172]:
tfidf_fitted_test

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=['yours', 'wherein', 'say', 'had', 'someone', 'elsewhere', 'do', 'go', 'none', 'besides', 'yourselves', 'below', 'therein', 'everything', 'always', 'mine', 'well', 'about', 'beside', 'while', 'noone', 're', 'show', 'her', 'nine', 'does', 'never', 'ca', 'i', 'off', 'if', 'ever', 'whatever'...', 'who', 'an', 'amongst', 'anyhow', 'becomes', 'one', 'but', 'become', 'nevertheless', 'down', 'a'],
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [173]:
tfidf_data_test

<100x438 sparse matrix of type '<class 'numpy.float64'>'
	with 552 stored elements in Compressed Sparse Row format>

# Apply the function

In [174]:
cv_fitted, cv_data, tfidf_fitted, tfidf_data = vectorize_both_ways(all_corpuses_df, 'cleaned_text')

In [125]:
pickle.dump(cv_fitted, open('iteration1_files/epoch1/cv_fitted', 'wb'))

In [126]:
pickle.dump(cv_data, open('iteration1_files/epoch1/cv_data', 'wb'))

In [127]:
pickle.dump(tfidf_fitted, open('iteration1_files/epoch1/tfidf_fitted', 'wb'))

In [128]:
pickle.dump(tfidf_data, open('iteration1_files/epoch1/tfidf_data', 'wb'))

# Define a function to generate fitted vectorization/model combos and data

In [79]:
def gen_vectorizer_model_combos(cv_fitted, cv_data, tfidf_fitted, tfidf_data, n_topics=5, random_state=30):
    n_topics=n_topics
    random_state=random_state
    
    
    nmf_cv = NMF(n_components=n_topics, random_state=random_state)
    nmf_cv_data = nmf_cv.fit_transform(cv_data)
    
    nmf_tfidf = NMF(n_components=n_topics, random_state=random_state)
    nmf_tfidf_data = nmf_tfidf.fit_transform(tfidf_data)
    
    lsa_cv = TruncatedSVD(n_components=n_topics, random_state=random_state)
    lsa_cv_data = lsa_cv.fit_transform(cv_data)
    
    lsa_tfidf = TruncatedSVD(n_components=n_topics, random_state=random_state)
    lsa_tfidf_data = lsa_tfidf.fit_transform(tfidf_data)
    
    lda_cv = LatentDirichletAllocation(n_components=n_topics, random_state=random_state)
    lda_cv_data = lda_cv.fit_transform(cv_data)
    
    lda_tfidf = LatentDirichletAllocation(n_components=n_topics, random_state=random_state)
    lda_tfidf_data = lda_tfidf.fit_transform(tfidf_data)
    
    combo_models_list = [nmf_cv, nmf_tfidf, lsa_cv, lsa_tfidf, lda_cv, lda_tfidf]
    
    return nmf_cv, nmf_cv_data, nmf_tfidf, nmf_tfidf_data, lsa_cv, lsa_cv_data, lsa_tfidf, lsa_tfidf_data, lda_cv, lda_cv_data, lda_tfidf, lda_tfidf_data, combo_models_list
    

In [80]:
nmf_cv, nmf_cv_data, nmf_tfidf, nmf_tfidf_data, lsa_cv, lsa_cv_data, lsa_tfidf, lsa_tfidf_data, lda_cv, lda_cv_data, lda_tfidf, lda_tfidf_data, combo_models_list = gen_vectorizer_model_combos(cv_fitted, cv_data, tfidf_fitted, tfidf_data)



In [129]:
pickle.dump(nmf_cv, open('iteration1_files/epoch1/nmf_cv.pkl', 'wb'))

In [130]:
pickle.dump(nmf_cv_data, open('iteration1_files/epoch1/nmf_cv_data.pkl', 'wb'))

In [131]:
pickle.dump(nmf_tfidf, open('iteration1_files/epoch1/nmf_tfidf.pkl', 'wb'))

In [132]:
pickle.dump(nmf_tfidf_data, open('iteration1_files/epoch1/nmf_tfidf_data.pkl', 'wb'))

In [133]:
pickle.dump(lsa_cv, open('iteration1_files/epoch1/lsa_cv.pkl', 'wb'))

In [134]:
pickle.dump(lsa_cv_data, open('iteration1_files/epoch1/lsa_cv_data.pkl', 'wb'))

In [135]:
pickle.dump(lsa_tfidf, open('iteration1_files/epoch1/lsa_tfidf.pkl', 'wb'))

In [136]:
pickle.dump(lsa_tfidf_data, open('iteration1_files/epoch1/lsa_tfidf_data.pkl', 'wb'))

In [137]:
pickle.dump(lda_cv, open('iteration1_files/epoch1/lda_cv.pkl', 'wb'))

In [138]:
pickle.dump(lda_cv_data, open('iteration1_files/epoch1/lda_cv_data.pkl', 'wb'))

In [139]:
pickle.dump(lda_tfidf, open('iteration1_files/epoch1/lda_tfidf.pkl', 'wb'))

In [140]:
pickle.dump(lda_tfidf_data, open('iteration1_files/epoch1/lda_tfidf_data.pkl', 'wb'))

In [141]:
pickle.dump(combo_models_list, open('iteration1_files/epoch1/combo_models_list.pkl', 'wb'))

# Define a function to obtain a topics df

In [311]:
def gen_topics_for_one_combo(combo_model, combo_model_name, fitted_vectorizer, num_top_words):
    feature_names = fitted_vectorizer.get_feature_names()
    combo_topics = []
    for idx, topic in enumerate(combo_model.components_):
        topic_words = " ".join([feature_names[i]
                        for i in topic.argsort()[:-num_top_words - 1:-1]])
        combo_topics.append("{}_topic{}: {}".format(combo_model_name, idx+1, topic_words))
    return combo_topics

In [312]:
gen_topics_for_one_combo(lda_tfidf, 'lda_tfidf', tfidf_fitted, 20)

['lda_tfidf_topic1: weight trying okay httpiglovequotesnet thoughts lets week head kill tips meet wiltedflower relate alienmonster dropneurons yall lose photos psa goal',
 'lda_tfidf_topic2: im happy look things follow month good pride blog need today wish sorry dark like love cute thats redheart pretty',
 'lda_tfidf_topic3: cant im dont want love hate life like feel friends people lost way help oh right reblog sad tell ive',
 'lda_tfidf_topic4: weheartit love hey thinspo mood httpswwwinstagramcomthepersonalquotes care black night ig pics white great time kiss sleep fall home use money',
 'lda_tfidf_topic5: want like know dont people time youre im love die fuck day new beautiful edit hard need art stop post']

In [313]:
def compile_topics_df(combo_models_cv, combo_models_tfidf, cv_fitted, tfidf_fitted, num_top_words):
    all_combos_topics = []
    combo_names_cv = ['nmf_cv', 'lsa_cv', 'lda_cv']
    combo_names_tfidf =  ['nmf_tfidf', 'lsa_tfidf', 'lda_tfidf']

    for idx, combo_model in enumerate(combo_models_cv):
        combo_topics = gen_topics_for_one_combo(combo_model, combo_names_cv[idx], cv_fitted, num_top_words)
        all_combos_topics.append(combo_topics)
    for idx, combo_model in enumerate(combo_models_tfidf):
        combo_topics = gen_topics_for_one_combo(combo_model, combo_names_tfidf[idx], tfidf_fitted, num_top_words)
        all_combos_topics.append(combo_topics)

    combo_topics_df = pd.DataFrame(all_combos_topics)
    return combo_topics_df

# Obtain topic names df

In [314]:
selfharmm_topic_names_df = compile_topics_df([nmf_cv, lsa_cv, lda_cv], [nmf_tfidf, lsa_tfidf, lda_tfidf], cv_fitted, tfidf_fitted, 20)

In [315]:
selfharmm_topic_names_df

Unnamed: 0,0,1,2,3,4
0,nmf_cv_topic1: want die people ask stop need f...,nmf_cv_topic2: im going time feel sorry gonna ...,nmf_cv_topic3: like feel people things cant lo...,nmf_cv_topic4: love blog hate think fall art h...,nmf_cv_topic5: dont know think life youre feel...
1,lsa_cv_topic1: like im dont want love feel kno...,lsa_cv_topic2: want dont love like know need d...,lsa_cv_topic3: like feel people feels felt sou...,lsa_cv_topic4: love blog like hate fall art ho...,lsa_cv_topic5: dont know youre care life think...
2,lda_cv_topic1: weight reblog okay stay trying ...,lda_cv_topic2: feel time like need look ive bl...,lda_cv_topic3: im love like think cant people ...,lda_cv_topic4: hey thinspo friend love weheart...,lda_cv_topic5: want dont know life like people...
3,nmf_tfidf_topic1: weheartit east eden eve kick...,nmf_tfidf_topic2: want dont know die need thin...,nmf_tfidf_topic3: love blog fall think hate ar...,nmf_tfidf_topic4: im sorry tired going think s...,nmf_tfidf_topic5: like feel people things rebl...
4,lsa_tfidf_topic1: weheartit east eden httpiglo...,lsa_tfidf_topic2: love im want dont like know ...,lsa_tfidf_topic3: love blog fall chicks mom ar...,lsa_tfidf_topic4: im sorry tired sad gonna goi...,lsa_tfidf_topic5: like feel people things your...
5,lda_tfidf_topic1: weight trying okay httpiglov...,lda_tfidf_topic2: im happy look things follow ...,lda_tfidf_topic3: cant im dont want love hate ...,lda_tfidf_topic4: weheartit love hey thinspo m...,lda_tfidf_topic5: want like know dont people t...


In [316]:
pickle.dump(selfharmm_topic_names_df, open('iteration1_files/epoch1/selfharmm_topic_names_df', 'wb'))

# Review topic names

### NMF_CV

In [317]:
for row in selfharmm_topic_names_df.iloc[0]:
    print(row)

nmf_cv_topic1: want die people ask stop need fuck reblog tell eat skinny women think hard weight little things talk lose okay
nmf_cv_topic2: im going time feel sorry gonna people cant fucking think good tired day life way trying happy eat today ive
nmf_cv_topic3: like feel people things cant look need youre weight shit feels felt life feeling time anxiety right thing lot little
nmf_cv_topic4: love blog hate think fall art hope ive got things people need person thank youre way look life didnt little
nmf_cv_topic5: dont know think life youre feel care time eat cant need people fuck right day hey anymore person going ive


### LSA_CV

In [318]:
for row in selfharmm_topic_names_df.iloc[1]:
    print(row)

lsa_cv_topic1: like im dont want love feel know people time life think cant things youre need going day eat right way
lsa_cv_topic2: want dont love like know need die people youre look reblog fuck life stop think things feel fall talk friends
lsa_cv_topic3: like feel people feels felt sound anxiety actually stomach drawing follow making loose shit look feeling wont extra dream alive
lsa_cv_topic4: love blog like hate fall art hope got thank person ive draw found find things look looking writing id didnt
lsa_cv_topic5: dont know youre care life think time feel cant going person hey let good ignore report anymore wanna pro day


### LDA_CV

In [319]:
for row in selfharmm_topic_names_df.iloc[2]:
    print(row)

lda_cv_topic1: weight reblog okay stay trying guys lose black world said year try white hard kill head skinny words calories cut
lda_cv_topic2: feel time like need look ive blog things good im stop love month pride best thats wanna long help follow
lda_cv_topic3: im love like think cant people happy day got fucking hate little eat going today body tell days youre sad
lda_cv_topic4: hey thinspo friend love weheartit right didnt thank care sorry day mood shit honestly dont looking left face great dead
lda_cv_topic5: want dont know life like people new youre person die fuck heart post live real makes wanted friends girl gonna


### NMF_TFIDF

In [320]:
for row in selfharmm_topic_names_df.iloc[3]:
    print(row)

nmf_tfidf_topic1: weheartit east eden eve kicked garden dönücez shed lil ocd happy friend finally pride wallerbridge costumes military outfits villanelle building
nmf_tfidf_topic2: want dont know die need think look fuck life care youre anymore pro ignore people eat report hey tell way
nmf_tfidf_topic3: love blog fall think hate art hope fucking need person chicks thank mom looking find draw ive let followers didnt
nmf_tfidf_topic4: im sorry tired going think sad gonna fucking time feel shit way cant good eat trying ok mind happy life
nmf_tfidf_topic5: like feel people things reblog cant youre need look good time happy month weight feels right day better life wish


### LSA_TFIDF

In [321]:
for row in selfharmm_topic_names_df.iloc[4]:
    print(row)

lsa_tfidf_topic1: weheartit east eden httpiglovequotesnet youre dont feel time need fuck good cant mood bad stop click beautiful blog world wanna
lsa_tfidf_topic2: love im want dont like know feel people think need blog life cant youre time look fucking things happy way
lsa_tfidf_topic3: love blog fall chicks mom art fell simon looking madly draw teach hope party redheart writing suffer clear woman edit
lsa_tfidf_topic4: im sorry tired sad gonna going love shit mood ok hungry ready good screaming ugw fucking trying proud lonely craving
lsa_tfidf_topic5: like feel people things youre cant good reblog feels need look follow better month time hate post blog looks wish


### LDA_TFIDF

In [322]:
for row in selfharmm_topic_names_df.iloc[5]:
    print(row)

lda_tfidf_topic1: weight trying okay httpiglovequotesnet thoughts lets week head kill tips meet wiltedflower relate alienmonster dropneurons yall lose photos psa goal
lda_tfidf_topic2: im happy look things follow month good pride blog need today wish sorry dark like love cute thats redheart pretty
lda_tfidf_topic3: cant im dont want love hate life like feel friends people lost way help oh right reblog sad tell ive
lda_tfidf_topic4: weheartit love hey thinspo mood httpswwwinstagramcomthepersonalquotes care black night ig pics white great time kiss sleep fall home use money
lda_tfidf_topic5: want like know dont people time youre im love die fuck day new beautiful edit hard need art stop post


# Create a dictionary out of the topic names

In [324]:
selfharmmm_dictionary = {'nmf_cv_topic1': 'eating', 'nmf_cv_topic2': 'positive', 'nmf_cv_topic3': 'depression', 'nmf_cv_topic4': 'art', 'nmf_cv_topic5': 'depression',
                        'lsa_cv_topic1': 'depression', 'lsa_cv_topic2': 'depression', 'lsa_cv_topic3': 'depression', 'lsa_cv_topic4': 'art', 'lsa_cv_topic5': 'depression',
                        'lda_cv_topic1': 'eating', 'lda_cv_topic2': 'spam', 'lda_cv_topic3': 'depression', 'lda_cv_topic4': 'eating', 'lda_cv_topic5': 'depression',
                        'nmf_tfidf_topic1': 'spam', 'nmf_tfidf_topic2': 'eating', 'nmf_tfidf_topic3': 'spam', 'nmf_tfidf_topic4': 'depression', 'nmf_tfidf_topic5': 'eating',
                        'lsa_tfidf_topic1': 'spam', 'lsa_tfidf_topic2': 'spam', 'lsa_tfidf_topic3': 'porn', 'lsa_tfidf_topic4': 'eating', 'lsa_tfidf_topic5': 'spam',
                        'lda_tfidf_topic1': 'spam', 'lda_tfidf_topic2': 'spam', 'lda_tfidf_topic3': 'depression', 'lda_tfidf_topic4': 'porn', 'lda_tfidf_topic5': 'art'}

# Define a function to plot the LSA document distribution with SVD

In [179]:
def plot_docs_svd(lsa_cv_data, lsa_tfidf_data, cleaned_df, text_plotting):
    data = cleaned_df[text_plotting]
    svd_cv = TruncatedSVD(n_components=2)
    documents_2d_cv = svd.fit_transform(lsa_cv_data)
    df_cv = pd.DataFrame(columns=['x', 'y', 'document'])
    df_cv['x'], df_cv['y'], df_cv['document'] = documents_2d_cv[:,0], documents_2d_cv[:,1], range(len(data))
 
    source_cv = ColumnDataSource(ColumnDataSource.from_df(df_cv))

    plot = figure(plot_width=600, plot_height=600, title='SVD Count Vectorized Doc Distribution')
    plot.circle("x", "y", size=12, source=source_cv, line_color="black", fill_alpha=0.8)

    show(plot, notebook_handle=True)
    
    svd_tfidf = TruncatedSVD(n_components=2)
    documents_2d_tfidf = svd.fit_transform(lsa_tfidf_data)
    df_tfidf = pd.DataFrame(columns=['x', 'y', 'document'])
    df_tfidf['x'], df_tfidf['y'], df_tfidf['document'] = documents_2d_tfidf[:,0], documents_2d_tfidf[:,1], range(len(data))
 
    source_tfidf = ColumnDataSource(ColumnDataSource.from_df(df_tfidf))

 
    plot = figure(plot_width=600, plot_height=600, title='SVD TFIDF Vectorized Doc Distribution')
    plot.circle("x", "y", size=12, source=source_tfidf, line_color="black", fill_alpha=0.8)

    show(plot, notebook_handle=True)
    

# Obtain the SVD visualizations

In [180]:
plot_docs_svd(lsa_cv_data, lsa_tfidf_data, all_corpuses_df, 'cleaned_text')

# Define a function to plot the LSA word distributions with SVD

In [163]:
def plot_words_svd(cv_fitted, lsa_cv_data, tfidf_fitted, lsa_tfidf_data, cleaned_df, text_plotting):
    svd_cv = TruncatedSVD(n_components=2)
    words_2d_cv = svd_cv.fit_transform(lsa_cv_data.T)

    df_cv = pd.DataFrame(columns=['x', 'y', 'word'])
    df_cv['x'], df_cv['y'], df_cv['word'] = words_2d_cv[:,0], words_2d_cv[:,1], cv_fitted.get_feature_names()

    source_cv = ColumnDataSource(ColumnDataSource.from_df(df_cv))

    plot = figure(plot_width=600, plot_height=600)
    plot.circle("x", "y", size=12, source=source_cv, line_color="black", fill_alpha=0.8)
    plot.add_layout(labels)
    show(plot, notebook_handle=True)
    
    svd_tfidf = TruncatedSVD(n_components=2)
    words_2d_tfidf = svd_tfidf.fit_transform(lsa_tfidf_data.T)

    df_tfidf = pd.DataFrame(columns=['x', 'y', 'word'])
    df_tfidf['x'], df_tfidf['y'], df_tfidf['word'] = words_2d_tfidf[:,0], words_2d_tfidf[:,1], tfidf_fitted.get_feature_names()

    source_tfidf = ColumnDataSource(ColumnDataSource.from_df(df_tfidf))

    plot = figure(plot_width=600, plot_height=600)
    plot.circle("x", "y", size=12, source=source_tfidf, line_color="black", fill_alpha=0.8)
    plot.add_layout(labels)
    show(plot, notebook_handle=True)    

In [181]:
plot_words_svd(cv_fitted, lsa_cv_data, tfidf_fitted, lsa_tfidf_data, all_corpuses_df, 'cleaned_text')

ValueError: Length of values does not match length of index

# Define function to get a dataframe per combo

In [272]:
def gen_df_per_combo(nmf_cv_data, nmf_tfidf_data, lsa_cv_data, lsa_tfidf_data, lda_cv_data, 
                     lda_tfidf_data, num_topics):
    
    # NMF_CV
    nmf_cv_columns = []
    for num in range(1, num_topics+1):
        nmf_cv_columns.append('nmf_cv_topic{}'.format(num))
    nmf_cv_df = pd.DataFrame(nmf_cv_data, columns=nmf_cv_columns)
    nmf_cv_df['nmf_cv_sum'] = nmf_cv_df.sum(axis=1)
    
    # NMF_TFIDF
    nmf_tfidf_columns = []
    for num in range(1, num_topics+1):
        nmf_tfidf_columns.append('nmf_tfidf_topic{}'.format(num))
    nmf_tfidf_df = pd.DataFrame(nmf_tfidf_data, columns=nmf_tfidf_columns)
    nmf_tfidf_df['nmf_tfidf_sum'] = nmf_tfidf_df.sum(axis=1)
    
    # LSA_CV
    lsa_cv_columns = []
    for num in range(1, num_topics+1):
        lsa_cv_columns.append('lsa_cv_topic{}'.format(num))    
    lsa_cv_df = pd.DataFrame(lsa_cv_data, columns=lsa_cv_columns)
    lsa_cv_df['lsa_cv_sum'] = lsa_cv_df.sum(axis=1)
    
    # LSA_TFIDF
    lsa_tfidf_columns = []
    for num in range(1, num_topics+1):
        lsa_tfidf_columns.append('lsa_tfidf_topic{}'.format(num))
    lsa_tfidf_df = pd.DataFrame(lsa_tfidf_data, columns=lsa_tfidf_columns)
    lsa_tfidf_df['lsa_tfidf_sum'] = lsa_tfidf_df.sum(axis=1)
    
    # LDA_CV
    lda_cv_columns = []
    for num in range(1, num_topics+1):
        lda_cv_columns.append('lda_cv_topic{}'.format(num))
    lda_cv_df = pd.DataFrame(lda_cv_data, columns=lda_cv_columns)
    lda_cv_df['lda_cv_sum'] = lda_cv_df.sum(axis=1)
    
    # LDA_TFIDF
    lda_tfidf_columns = []
    for num in range(1, num_topics+1):
        lda_tfidf_columns.append('lda_tfidf_topic{}'.format(num))
    lda_tfidf_df = pd.DataFrame(lda_tfidf_data, columns=lda_tfidf_columns)
    lda_tfidf_df['lda_tfidf_sum'] = lda_tfidf_df.sum(axis=1)
    
    
    return nmf_cv_df, nmf_tfidf_df, lsa_cv_df, lsa_tfidf_df, lda_cv_df, lda_tfidf_df

# Obtain dataframe per combo

In [273]:
nmf_cv_df, nmf_tfidf_df, lsa_cv_df, lsa_tfidf_df, lda_cv_df, lda_tfidf_df = gen_df_per_combo(nmf_cv_data, nmf_tfidf_data, lsa_cv_data, lsa_tfidf_data, lda_cv_data, lda_tfidf_data, 5)


In [278]:
nmf_cv_df.head(2)

Unnamed: 0,nmf_cv_topic1,nmf_cv_topic2,nmf_cv_topic3,nmf_cv_topic4,nmf_cv_topic5,nmf_cv_sum
0,0.001328,5.9e-05,0.001219,0.003068,1.9e-05,0.005694
1,0.000208,0.0,0.001079,0.001256,0.001779,0.004322


In [277]:
nmf_tfidf_df.head(2)

Unnamed: 0,nmf_tfidf_topic1,nmf_tfidf_topic2,nmf_tfidf_topic3,nmf_tfidf_topic4,nmf_tfidf_topic5,nmf_tfidf_sum
0,0.0,0.000983,0.004492,0.0,0.004111,0.009586
1,0.0,0.001017,0.001631,0.0,0.00227,0.004918


In [279]:
lsa_cv_df.head(2)

Unnamed: 0,lsa_cv_topic1,lsa_cv_topic2,lsa_cv_topic3,lsa_cv_topic4,lsa_cv_topic5,lsa_cv_sum
0,0.010605,0.007349,-0.002504,0.008991,-0.006624,0.017817
1,0.007639,0.006021,7.3e-05,0.00324,0.004887,0.02186


In [280]:
lsa_tfidf_df.head(2)

Unnamed: 0,lsa_tfidf_topic1,lsa_tfidf_topic2,lsa_tfidf_topic3,lsa_tfidf_topic4,lsa_tfidf_topic5,lsa_tfidf_sum
0,8e-06,0.012497,0.005914,-0.004506,0.006463,0.020375
1,-1e-06,0.006829,0.000862,-0.003128,0.004275,0.008836


In [281]:
lda_cv_df.head(2)

Unnamed: 0,lda_cv_topic1,lda_cv_topic2,lda_cv_topic3,lda_cv_topic4,lda_cv_topic5,lda_cv_sum
0,0.066667,0.066667,0.733332,0.066667,0.066667,1.0
1,0.550164,0.299833,0.050001,0.050001,0.050001,1.0


In [282]:
lda_tfidf_df.head(2)

Unnamed: 0,lda_tfidf_topic1,lda_tfidf_topic2,lda_tfidf_topic3,lda_tfidf_topic4,lda_tfidf_topic5,lda_tfidf_sum
0,0.082857,0.082857,0.668572,0.082857,0.082857,1.0
1,0.501987,0.27833,0.073227,0.073229,0.073227,1.0


In [283]:
pickle.dump(nmf_cv_df, open('iteration1_files/epoch1/nmf_cv_df.pkl', 'wb'))

In [284]:
pickle.dump(nmf_tfidf_df, open('iteration1_files/epoch1/nmf_tfidf_df.pkl', 'wb'))

In [285]:
pickle.dump(lsa_cv_df, open('iteration1_files/epoch1/lsa_cv_df.pkl', 'wb'))

In [286]:
pickle.dump(lsa_tfidf_df, open('iteration1_files/epoch1/lsa_tfidf_df.pkl', 'wb'))

In [287]:
pickle.dump(lda_cv_df, open('iteration1_files/epoch1/lda_cv_df.pkl', 'wb'))

In [288]:
pickle.dump(lda_tfidf_df, open('iteration1_files/epoch1/lda_tfidf_df.pkl', 'wb'))

# Define functions to compile a df to compare the combos and find the resulting topic

In [326]:
def map_topic_names(compiled_combo_df, max_topic_type, topics_dict):
    all_topic_names = []
    for topic in compiled_combo_df[max_topic_type]:
        topic_name = topics_dict[topic]
        all_topic_names.append(topic_name)
    return all_topic_names

In [357]:
def compile_combo_dfs(cleaned_df, text_used, topics_dict, nmf_cv_df, nmf_tfidf_df, lsa_cv_df, lsa_tfidf_df, lda_cv_df, lda_tfidf_df):
    non_lda_max_model_df = pd.concat([nmf_cv_df['nmf_cv_sum'],nmf_tfidf_df['nmf_tfidf_sum'], lsa_cv_df['lsa_cv_sum'], lsa_tfidf_df['lsa_tfidf_sum']], axis=1)
    non_lda_max_model_df['non_lda_max_value'] = non_lda_max_model_df.max(axis=1)
    non_lda_max_model_df['non_lda_max_model'] = non_lda_max_model_df.idxmax(axis=1)
    
    non_lda_max_topic_df = pd.concat([nmf_cv_df, nmf_tfidf_df, lsa_cv_df, lsa_tfidf_df], axis=1)
    non_lda_max_topic_df.drop(columns=['nmf_cv_sum', 'nmf_tfidf_sum', 'lsa_cv_sum', 'lsa_tfidf_sum'], inplace=True)
    non_lda_max_topic_df['non_lda_max_topic']  = non_lda_max_topic_df.idxmax(axis=1)
        
    lda_max_topic_df = pd.concat([lda_cv_df, lda_tfidf_df], axis=1)
    lda_max_topic_df.drop(columns=['lda_cv_sum','lda_tfidf_sum'], axis=1, inplace=True)
    lda_max_topic_df['lda_max_topic'] = lda_max_topic_df.idxmax(axis=1)
    
    final_df = pd.DataFrame()
    final_df[text_used] = cleaned_df[text_used]
    final_df['username'] = cleaned_df['username']
    final_df['non_lda_max_topic'] = non_lda_max_topic_df['non_lda_max_topic']
    final_df['lda_max_topic'] = lda_max_topic_df['lda_max_topic']
    final_df['non_lda_topic_name'] = map_topic_names(final_df, 'non_lda_max_topic', topics_dict)
    final_df['lda_topic_name'] = map_topic_names(final_df, 'lda_max_topic', topics_dict)
        
    final_df['non_lda_max_model'] = non_lda_max_model_df['non_lda_max_model']
    final_df['non_lda_max_value'] = non_lda_max_model_df['non_lda_max_value']
    
    return final_df

# Obtain compiled modeling df

In [358]:
selfharmmm_final_df = compile_combo_dfs(all_corpuses_df, 'cleaned_text', selfharmmm_dictionary, nmf_cv_df, nmf_tfidf_df, lsa_cv_df, lsa_tfidf_df, lda_cv_df, lda_tfidf_df)

In [359]:
selfharmmm_final_df

Unnamed: 0,cleaned_text,username,non_lda_max_topic,lda_max_topic,non_lda_topic_name,lda_topic_name,non_lda_max_model,non_lda_max_value
0,space boy,spaceeblack,lsa_tfidf_topic2,lda_cv_topic3,spam,depression,lsa_tfidf_sum,2.037532e-02
1,seen it all before bring me the horizon,vongriffis,lsa_cv_topic1,lda_cv_topic1,depression,eating,lsa_cv_sum,2.186021e-02
2,uicideboy kill yourself part iii,vongriffis,lsa_cv_topic1,lda_cv_topic1,depression,eating,nmf_tfidf_sum,8.783881e-03
3,i feel like im a no one thats what they told me,vongriffis,lsa_cv_topic1,lda_cv_topic2,depression,spam,lsa_cv_sum,1.050292e+00
4,issues the worst of them,vongriffis,lsa_cv_topic1,lda_cv_topic4,depression,eating,lsa_cv_sum,1.286314e-02
5,hate it when you fight me love it when i die slow,vongriffis,lsa_cv_topic4,lda_cv_topic4,art,eating,lsa_cv_sum,1.013281e+00
6,【新商品】「光るキノコのマグネット」暗闇でぼんやりと光る神秘的なマグネット。いろんなところに...,yuckcore,lsa_tfidf_topic5,lda_cv_topic2,spam,spam,lsa_tfidf_sum,1.355725e-09
7,oceaniccunt,yuckcore,lsa_tfidf_topic4,lda_tfidf_topic1,eating,spam,nmf_cv_sum,0.000000e+00
8,httpstwittercompigpagestatus,yuckcore,lsa_tfidf_topic5,lda_cv_topic5,spam,depression,lsa_tfidf_sum,1.323762e-09
9,rooftop herb,yuckcore,lsa_cv_topic2,lda_cv_topic4,depression,eating,nmf_tfidf_sum,5.624583e-04


In [362]:
pickle.dump(selfharmmm_final_df, open('iteration1_files/epoch1/selfharmmm_final_df.pkl', 'wb'))

# Define a function that would provide a random sample 

In [394]:
def random_sample(final_df, criterion1=None, value1=None, criterion2=None, value2=None, use_one_criterion=False, use_two_criteria=False, sample_size=.3, random_state=30):
    sample_size = sample_size
    random_state = random_state
    
    if use_two_criteria == True:
        new_df = final_df[(final_df[criterion1] == value1) & (final_df[criterion2] == value2)]
    elif use_one_criterion == True:
        new_df = final_df[final_df[criterion1] == value1]
    else:
        new_df = final_df.copy()
    
    return new_df.sample(frac=sample_size)

# Investigate posts with lsa_tfidf as max model, as that model had mostly porn/spam topics

In [363]:
lsa_tfidf_sample = random_sample(selfharmmm_final_df, criterion1='non_lda_max_model', value1='lsa_tfidf_sum', use_one_criterion=True)

In [368]:
len(lsa_tfidf_sample)

379

In [367]:
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 500)
lsa_tfidf_sample

Unnamed: 0,cleaned_text,username,non_lda_max_topic,lda_max_topic,non_lda_topic_name,lda_topic_name,non_lda_max_model,non_lda_max_value
6830,passion,unspiritedaway,lsa_cv_topic4,lda_cv_topic3,art,depression,lsa_tfidf_sum,0.002334112
817,queensbelike,annaxhoang,lsa_tfidf_topic4,lda_cv_topic4,eating,eating,lsa_tfidf_sum,5.034679e-09
8342,lost fantasia mnb do not edit,officialjaehwan,lsa_cv_topic1,lda_cv_topic2,depression,spam,lsa_tfidf_sum,0.02484648
7434,httpswwwinstagramcomthepersonalquotes,eurphoric-afflictions,lsa_tfidf_topic5,lda_cv_topic4,spam,eating,lsa_tfidf_sum,0.02093805
6787,by nishe,knowing,lsa_tfidf_topic5,lda_cv_topic5,spam,depression,lsa_tfidf_sum,1.124936e-06
2619,via weheartit,le-insomni-e,lsa_tfidf_topic1,lda_cv_topic4,spam,eating,lsa_tfidf_sum,1.000046
7789,edit,killingstrangers666,lsa_tfidf_topic5,lda_cv_topic2,spam,spam,lsa_tfidf_sum,0.02517968
8303,how tall and heavy are you,gutgrowing1,lsa_cv_topic1,lda_cv_topic2,depression,spam,lsa_tfidf_sum,0.003391252
2057,httpswwwinstagramcomlenaarte,lovely-sighss,lsa_tfidf_topic5,lda_tfidf_topic1,spam,spam,lsa_tfidf_sum,5.821721e-09
5614,exercises,daintyintheflesh,lsa_tfidf_topic5,lda_tfidf_topic1,spam,spam,lsa_tfidf_sum,6.03673e-10


## Identify the false positives

In [369]:
lsa_tfidf_sample_fp_list = [7249, 4175, 6574, 8012, 8197, 39, 5628]

# Define a function to determine precision score

In [383]:
def get_precision_score(sample_df, fp_list):
    FP = len(fp_list)
    TP = len(sample_df) - FP
    
    precision_score = TP/(TP + FP)
    return precision_score

## Obtain precision score of false positives (not spam/porn) for posts with lsa_tfidf as max model

In [384]:
lsa_tfidf_sample_precision = get_precision_score(lsa_tfidf_sample, lsa_tfidf_sample_fp_list)
lsa_tfidf_sample_precision

0.9815303430079155

# Define a function to remove unrelated posts

In [403]:
def remove_unrelated(final_df, criterion1, value1, criterion2=None, value2=None, use_two_criteria=False):
    
    if use_two_criteria == True:
        new_df = final_df[(final_df[criterion1] != value1) & (final_df[criterion2] != value2)]
    else:
        new_df = final_df[final_df[criterion1] != value1]
        new_df.reset_index(inplace=True)
        new_df.drop(column=['index'], inplace=True)
    return new_df

# OFFICIAL REMOVAL 1: Remove posts with 'lsa_tfidf_sum' as max model

In [385]:
selfharmmm_final_df_removed_1 = remove_unrelated(selfharmmm_final_df, 'non_lda_max_model', 'lsa_tfidf_sum')

In [386]:
pickle.dump(selfharmmm_final_df_removed_1, open('iteration1_files/epoch1/selfharmmm_final_df_removed_1.pkl', 'wb'))

# Investigate posts with lda_tfidf topics 1, 2, and 4, for spam and porn

In [419]:
lda_tfidf_almost_all = selfharmmm_final_df_removed_1[(selfharmmm_final_df_removed_1['lda_max_topic'] == 'lda_tfidf_topic1') | (selfharmmm_final_df_removed_1['lda_max_topic'] == 'lda_tfidf_topic2') | (selfharmmm_final_df_removed_1['lda_max_topic'] == 'lda_tfidf_topic4')]

In [425]:

lda_tfidf_sample = random_sample(lda_tfidf_almost_all)

In [426]:
len(lda_tfidf_sample)

293

In [427]:
lda_tfidf_sample.drop(columns=['index'], inplace=True)

In [428]:
lda_tfidf_sample

Unnamed: 0,cleaned_text,username,non_lda_max_topic,lda_max_topic,non_lda_topic_name,lda_topic_name,non_lda_max_model,non_lda_max_value
2922,derblacksheepcamerawithflash,derblacksheep,lsa_tfidf_topic5,lda_tfidf_topic1,spam,spam,lsa_cv_sum,1.213578e-15
4214,httpsyoutubewovpzkkzag,katalinathecat,lsa_tfidf_topic2,lda_tfidf_topic1,spam,spam,lsa_cv_sum,2.385013e-16
3419,the ps turned less then a week ago wow i feel old,funnygamememes,lsa_cv_topic1,lda_tfidf_topic1,depression,spam,lsa_cv_sum,0.321974
4996,every single one dude every single one of your draws are so intense andor lovely andor sexy how do u do that how can u,nadaboodraws,lsa_cv_topic1,lda_tfidf_topic4,depression,porn,lsa_cv_sum,0.04659547
1911,fall out boy releases man i a and is on tour panic at the disco about to release pray for the wicked twenty øne piløts,delicatelyloudcherryblossom,lsa_cv_topic4,lda_tfidf_topic4,art,porn,lsa_cv_sum,0.06249808
2938,respite of the grey knight past a glen of ancient trees upon a mountain of stone and blades a lone knight sits,chivesxp,lsa_cv_topic1,lda_tfidf_topic4,depression,porn,lsa_cv_sum,0.0143543
1398,reblog to wake up a lighter number than you were yesterday cherryblossomcherryblossom,xxwrenaddamsxx,lsa_cv_topic1,lda_tfidf_topic2,depression,spam,lsa_cv_sum,0.0444992
3538,me is chillsomeone chillme lets fight,nirvanaisthedestination-blog,lsa_cv_topic1,lda_tfidf_topic1,depression,spam,lsa_cv_sum,0.0150082
5316,guess my favorite color,eat-apples,lsa_cv_topic1,lda_tfidf_topic2,depression,spam,lsa_cv_sum,0.02975809
219,sorry about the lack of stream today guys im catching up on some work things ill make up for it v,wolfpainters,lsa_cv_topic1,lda_tfidf_topic2,depression,spam,nmf_cv_sum,0.2719114


## Identify false positives

In [429]:
lda_tfidf_sample_fp_list = [7421, 3557, 6899, 2269, 6916, 248, 4492, 833, 6918, 2688, 5127, 5196, 4884, 5050, 2131, 2502, 2125, 969, 706, 698, 3918, 3242, 7004, 3278]

In [430]:
get_precision_score(lda_tfidf_sample, lda_tfidf_sample_fp_list)

0.9180887372013652

# OFFICIAL REMOVAL 2: Remove posts with lda_tfidf topics 1, 2, and 4

In [434]:
selfharmmm_final_df_removed_2 = selfharmmm_final_df_removed_1[(selfharmmm_final_df_removed_1['lda_max_topic'] != 'lda_tfidf_topic1') & (selfharmmm_final_df_removed_1['lda_max_topic'] != 'lda_tfidf_topic2') & (selfharmmm_final_df_removed_1['lda_max_topic'] != 'lda_tfidf_topic4')]

In [435]:
selfharmmm_final_df_removed_2.shape

(6531, 9)

In [436]:
selfharmmm_final_df_removed_2

Unnamed: 0,index,cleaned_text,username,non_lda_max_topic,lda_max_topic,non_lda_topic_name,lda_topic_name,non_lda_max_model,non_lda_max_value
0,1,seen it all before bring me the horizon,vongriffis,lsa_cv_topic1,lda_cv_topic1,depression,eating,lsa_cv_sum,2.186021e-02
1,2,uicideboy kill yourself part iii,vongriffis,lsa_cv_topic1,lda_cv_topic1,depression,eating,nmf_tfidf_sum,8.783881e-03
2,3,i feel like im a no one thats what they told me,vongriffis,lsa_cv_topic1,lda_cv_topic2,depression,spam,lsa_cv_sum,1.050292e+00
3,4,issues the worst of them,vongriffis,lsa_cv_topic1,lda_cv_topic4,depression,eating,lsa_cv_sum,1.286314e-02
4,5,hate it when you fight me love it when i die slow,vongriffis,lsa_cv_topic4,lda_cv_topic4,art,eating,lsa_cv_sum,1.013281e+00
6,9,rooftop herb,yuckcore,lsa_cv_topic2,lda_cv_topic4,depression,eating,nmf_tfidf_sum,5.624583e-04
7,10,fat boys fat boys what you gonna do what you gonna do when they crawl on you,yuckcore,lsa_cv_topic1,lda_cv_topic5,depression,depression,nmf_cv_sum,4.491532e-02
8,11,eye marbles,yuckcore,lsa_cv_topic2,lda_cv_topic2,depression,spam,nmf_cv_sum,1.178881e-03
9,13,this striking mushroom has no common name so well call it by its scientific name hygrocybe astatogala it belongs to a group,yuckcore,lsa_cv_topic1,lda_cv_topic2,depression,spam,lsa_cv_sum,9.451377e-03
10,14,im wondering if maybe maybe they can have mary be instrumental in helping to rescue dean maybe this will be the wakeup call,thejabberwock,lsa_cv_topic1,lda_cv_topic3,depression,depression,nmf_cv_sum,2.381624e-01


# Prepare updated dataset for Epoch 2

In [437]:
epoch2_df = selfharmmm_final_df_removed_2.copy()

In [439]:
epoch2_df.reset_index(inplace=True)

In [440]:
epoch2_df.columns

Index(['level_0', 'index', 'cleaned_text', 'username', 'non_lda_max_topic',
       'lda_max_topic', 'non_lda_topic_name', 'lda_topic_name',
       'non_lda_max_model', 'non_lda_max_value'],
      dtype='object')

In [441]:
epoch2_df.drop(columns=['level_0', 'index','non_lda_max_topic','lda_max_topic', 'non_lda_topic_name', 'lda_topic_name', 'non_lda_max_model', 'non_lda_max_value'], inplace=True)

In [442]:
epoch2_df

Unnamed: 0,cleaned_text,username
0,seen it all before bring me the horizon,vongriffis
1,uicideboy kill yourself part iii,vongriffis
2,i feel like im a no one thats what they told me,vongriffis
3,issues the worst of them,vongriffis
4,hate it when you fight me love it when i die slow,vongriffis
5,rooftop herb,yuckcore
6,fat boys fat boys what you gonna do what you gonna do when they crawl on you,yuckcore
7,eye marbles,yuckcore
8,this striking mushroom has no common name so well call it by its scientific name hygrocybe astatogala it belongs to a group,yuckcore
9,im wondering if maybe maybe they can have mary be instrumental in helping to rescue dean maybe this will be the wakeup call,thejabberwock


In [443]:
pickle.dump(epoch2_df, open('iteration1_files/epoch2/epoch2_df.pkl', 'wb'))