# Import Libraries

In [122]:
# General system libraries
import os
import sys
from IPython.display import Image, Markdown
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Dataframe libraries
import pandas as pd
from pandas import DataFrame, read_csv

# Number manipulation
import scipy.sparse
from scipy.ndimage.filters import generic_filter
import patsy
import numpy as np

# Plotting libaries
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
%matplotlib inline

# Data type libaries
from datetime import datetime as dt

# File manipulation
import pickle
import pandas.io.sql as pd_sql
from sqlalchemy import create_engine
import psycopg2 as pg
from flatten_json import flatten

# NLP libraries
import wikipedia as wiki
from nltk import word_tokenize, sent_tokenize,FreqDist, pos_tag
from nltk.corpus import stopwords
import gensim as gn
from gensim import corpora, models, similarities
from collections import defaultdict
from six import iteritems
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from spacy.lang.en.stop_words import STOP_WORDS
import string
import emoji
import enchant
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer


# Scraping libraries
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from scraping_functions.tumblr_api import get_client
import pytumblr

# Stats libaries
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import datasets, linear_model, metrics
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import svm, datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier


# Other libaries
import geopy
import pickle_functions as pf

# Import Raw Posts DF

In [10]:
selfharmmm_raw_text_df = pd.read_pickle('iteration1_files/selfharmmm_raw_text_df.pkl')

In [11]:
selfharmmm_raw_text_df.head(5)

Unnamed: 0,username,text
0,spaceeblack,
1,spaceeblack,
2,spaceeblack,
3,spaceeblack,
4,spaceeblack,


# Define functions to clean the text 

Need to clean the text of the following items:
1. Punctuations and numbers
2. Emojis
3. Other languages
4. Empty strings

In [22]:
punctuation = string.punctuation

In [23]:
type(punctuation)

str

In [51]:
'uicideboy // kill yourself (part iii)'.replace('/', '')

'uicideboy  kill yourself (part iii)'

In [85]:
def clean_posts_without_sentences(raw_post_df):
    cleaned_strings = []
    punctuation = '!"#$%&\\\()*+,-./:;<=>?@[]^_`{|}~/….“”’\n🌹'
    for string in raw_post_df['text']:
        string = string.strip().lower()
        string = replace_emoji_with_text(string)
        for char in string:
            if char in punctuation:
                string = string.replace(char,'')
            if char in ['0','1','2','3','4','5','6','7','8','9']:
                string = string.replace(str(char), '')
        cleaned_strings.append(string.strip())
    new_df = raw_post_df.copy()
    new_df['cleaned_text'] = cleaned_strings
    new_df['cleaned_text'].replace('', np.nan, inplace=True)
    new_df.dropna(inplace=True)
    return new_df

In [3]:
def replace_emoji_with_text(string):
    string_list = list(string)
    for char in string_list:
        if char in emoji.UNICODE_EMOJI.keys():
            string_list.remove(char)
            string_list.append(emoji.UNICODE_EMOJI[char])
    return ''.join(string_list).strip()

## Test these Functions

In [87]:
cleaned_posts_test = clean_posts_without_sentences(selfharmmm_raw_text_df[:100])

In [88]:
cleaned_posts_test

Unnamed: 0,username,text,cleaned_text
19,spaceeblack,Space Boy,space boy
22,vongriffis,seen it all before // bring me the horizon,seen it all before bring me the horizon
26,vongriffis,$UICIDEBOY$ // Kill Yourself (Part III),uicideboy kill yourself part iii
30,vongriffis,"I feel like I’m a no one, that’s what they tol...",i feel like im a no one thats what they told me
31,vongriffis,Issues - The Worst Of Them,issues the worst of them
36,vongriffis,"Hate it when you fight me, love it when I die ...",hate it when you fight me love it when i die slow
39,vongriffis,designed by blck-xcvi.tumblr.com instagram: @b...,designed by blckxcvitumblrcom instagram blckxcvi
40,yuckcore,“【新商品】「光るキノコのマグネット」暗闇でぼんやりと光る神秘的なマグネット。いろんなところ...,【新商品】「光るキノコのマグネット」暗闇でぼんやりと光る神秘的なマグネット。いろんなところに...
41,yuckcore,@oceaniccunt,oceaniccunt
43,yuckcore,https://twitter.com/pig_page/status/9921757729...,httpstwittercompigpagestatus


# Obtain Cleaned Strings

In [123]:
cleaned_df = clean_posts_without_sentences(selfharmmm_raw_text_df)

In [124]:
pickle.dump(cleaned_df, open('iteration1_files/cleaned_df.pkl', 'wb'))

# Define a function to identify non-English text

In [81]:
def identify_non_english(cleaned_df, max_allowed):
    en = enchant.Dict("en_US")
    language = []
    all_non_english = []
    for string in cleaned_df['cleaned_text']:
        non_english_words = []
        for word in string.split():
            if en.check(word) == False:
                non_english_words.append(word)
        if len(non_english_words) >= max_allowed:
            language.append('not_english')
            all_non_english.append(non_english_words)
        else:
            language.append('english')
            all_non_english.append('all_english')
    new_df = cleaned_df.copy()
    new_df['language'] = language
    new_df['non_english_words'] = all_non_english
    return new_df

## Test this function

In [89]:
identify_non_english(cleaned_posts_test, 3)

Unnamed: 0,username,text,cleaned_text,language,non_english_words
19,spaceeblack,Space Boy,space boy,english,all_english
22,vongriffis,seen it all before // bring me the horizon,seen it all before bring me the horizon,english,all_english
26,vongriffis,$UICIDEBOY$ // Kill Yourself (Part III),uicideboy kill yourself part iii,english,all_english
30,vongriffis,"I feel like I’m a no one, that’s what they tol...",i feel like im a no one thats what they told me,english,all_english
31,vongriffis,Issues - The Worst Of Them,issues the worst of them,english,all_english
36,vongriffis,"Hate it when you fight me, love it when I die ...",hate it when you fight me love it when i die slow,english,all_english
39,vongriffis,designed by blck-xcvi.tumblr.com instagram: @b...,designed by blckxcvitumblrcom instagram blckxcvi,not_english,"[blckxcvitumblrcom, instagram, blckxcvi]"
40,yuckcore,“【新商品】「光るキノコのマグネット」暗闇でぼんやりと光る神秘的なマグネット。いろんなところ...,【新商品】「光るキノコのマグネット」暗闇でぼんやりと光る神秘的なマグネット。いろんなところに...,english,all_english
41,yuckcore,@oceaniccunt,oceaniccunt,english,all_english
43,yuckcore,https://twitter.com/pig_page/status/9921757729...,httpstwittercompigpagestatus,english,all_english


# Identify not English

In [92]:
selfharmmm_english_identified_df = identify_non_english(cleaned_df, 3)

In [93]:
selfharmmm_english_identified_df.head(5)

Unnamed: 0,username,text,cleaned_text,language,non_english_words
19,spaceeblack,Space Boy,space boy,english,all_english
22,vongriffis,seen it all before // bring me the horizon,seen it all before bring me the horizon,english,all_english
26,vongriffis,$UICIDEBOY$ // Kill Yourself (Part III),uicideboy kill yourself part iii,english,all_english
30,vongriffis,"I feel like I’m a no one, that’s what they tol...",i feel like im a no one thats what they told me,english,all_english
31,vongriffis,Issues - The Worst Of Them,issues the worst of them,english,all_english


In [95]:
selfharmmm_removed_non_english_df = selfharmmm_english_identified_df[selfharmmm_english_identified_df['language'] == 'english']

In [97]:
selfharmmm_removed_non_english_df.drop(columns=['language', 'non_english_words'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [98]:
selfharmmm_removed_non_english_df.reset_index()

Unnamed: 0,index,username,text,cleaned_text
0,19,spaceeblack,Space Boy,space boy
1,22,vongriffis,seen it all before // bring me the horizon,seen it all before bring me the horizon
2,26,vongriffis,$UICIDEBOY$ // Kill Yourself (Part III),uicideboy kill yourself part iii
3,30,vongriffis,"I feel like I’m a no one, that’s what they tol...",i feel like im a no one thats what they told me
4,31,vongriffis,Issues - The Worst Of Them,issues the worst of them
5,36,vongriffis,"Hate it when you fight me, love it when I die ...",hate it when you fight me love it when i die slow
6,40,yuckcore,“【新商品】「光るキノコのマグネット」暗闇でぼんやりと光る神秘的なマグネット。いろんなところ...,【新商品】「光るキノコのマグネット」暗闇でぼんやりと光る神秘的なマグネット。いろんなところに...
7,41,yuckcore,@oceaniccunt,oceaniccunt
8,43,yuckcore,https://twitter.com/pig_page/status/9921757729...,httpstwittercompigpagestatus
9,47,yuckcore,rooftop 🌿,rooftop herb


In [100]:
selfharmmm_removed_non_english_df

Unnamed: 0,username,text,cleaned_text
19,spaceeblack,Space Boy,space boy
22,vongriffis,seen it all before // bring me the horizon,seen it all before bring me the horizon
26,vongriffis,$UICIDEBOY$ // Kill Yourself (Part III),uicideboy kill yourself part iii
30,vongriffis,"I feel like I’m a no one, that’s what they tol...",i feel like im a no one thats what they told me
31,vongriffis,Issues - The Worst Of Them,issues the worst of them
36,vongriffis,"Hate it when you fight me, love it when I die ...",hate it when you fight me love it when i die slow
40,yuckcore,“【新商品】「光るキノコのマグネット」暗闇でぼんやりと光る神秘的なマグネット。いろんなところ...,【新商品】「光るキノコのマグネット」暗闇でぼんやりと光る神秘的なマグネット。いろんなところに...
41,yuckcore,@oceaniccunt,oceaniccunt
43,yuckcore,https://twitter.com/pig_page/status/9921757729...,httpstwittercompigpagestatus
47,yuckcore,rooftop 🌿,rooftop herb


In [117]:
def stem_and_lemmatize(cleaned_df):
    stemmer = SnowballStemmer("english", ignore_stopwords=False)
    wnl = WordNetLemmatizer()
    all_stemmed_strings = []
    all_lemmatized_strings = []
    for string in cleaned_df['cleaned_text']:
        lemmatized_string = [wnl.lemmatize(i,j[0].lower()) if j[0].lower() in ['a','n','v'] else wnl.lemmatize(i) for i,j in pos_tag(word_tokenize(string))]
        words = word_tokenize(string)
        stemmed_string = []
        for word in words:
            word_stemmed = stemmer.stem(word)
            stemmed_string.append(word_stemmed)
        all_stemmed_strings.append(' '.join(stemmed_string))
        all_lemmatized_strings.append(' '.join(lemmatized_string))
    new_df = cleaned_df.copy()
    new_df['stemmed_text'] = all_stemmed_strings
    new_df['lemmatized_text'] = all_lemmatized_strings
    return new_df

## Test this function

In [119]:
all_corpuses_df_test = stem_and_lemmatize(selfharmmm_removed_non_english_df[:100])

In [120]:
all_corpuses_df_test

Unnamed: 0,username,text,cleaned_text,stemmed_text,lemmatized_text
19,spaceeblack,Space Boy,space boy,space boy,space boy
22,vongriffis,seen it all before // bring me the horizon,seen it all before bring me the horizon,seen it all befor bring me the horizon,see it all before bring me the horizon
26,vongriffis,$UICIDEBOY$ // Kill Yourself (Part III),uicideboy kill yourself part iii,uicideboy kill yourself part iii,uicideboy kill yourself part iii
30,vongriffis,"I feel like I’m a no one, that’s what they tol...",i feel like im a no one thats what they told me,i feel like im a no one that what they told me,i feel like im a no one thats what they tell me
31,vongriffis,Issues - The Worst Of Them,issues the worst of them,issu the worst of them,issue the worst of them
36,vongriffis,"Hate it when you fight me, love it when I die ...",hate it when you fight me love it when i die slow,hate it when you fight me love it when i die slow,hate it when you fight me love it when i die slow
40,yuckcore,“【新商品】「光るキノコのマグネット」暗闇でぼんやりと光る神秘的なマグネット。いろんなところ...,【新商品】「光るキノコのマグネット」暗闇でぼんやりと光る神秘的なマグネット。いろんなところに...,【新商品】「光るキノコのマグネット」暗闇でぼんやりと光る神秘的なマグネット。いろんなところに...,【新商品】「光るキノコのマグネット」暗闇でぼんやりと光る神秘的なマグネット。いろんなところに...
41,yuckcore,@oceaniccunt,oceaniccunt,oceaniccunt,oceaniccunt
43,yuckcore,https://twitter.com/pig_page/status/9921757729...,httpstwittercompigpagestatus,httpstwittercompigpagestatus,httpstwittercompigpagestatus
47,yuckcore,rooftop 🌿,rooftop herb,rooftop herb,rooftop herb


# Obtain dataframe of all corpus types

In [121]:
all_corpuses_df = stem_and_lemmatize(selfharmmm_removed_non_english_df)

In [125]:
pickle.dump(all_corpuses_df, open('iteration1_files/all_corpuses_df.pkl', 'wb'))