# Import Libraries

In [2]:
# General system libraries
import os
import sys
from IPython.display import Image, Markdown
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Dataframe libraries
import pandas as pd
from pandas import DataFrame, read_csv

# Number manipulation
import scipy.sparse
from scipy.ndimage.filters import generic_filter
import patsy
import numpy as np

# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
%matplotlib inline

# Data type libraries
from datetime import datetime as dt

# File manipulation
import pickle
import pandas.io.sql as pd_sql
from sqlalchemy import create_engine
import psycopg2 as pg
from flatten_json import flatten

# NLP libraries
import wikipedia as wiki
from nltk import word_tokenize, sent_tokenize,FreqDist
from nltk.corpus import stopwords
import gensim as gn
from gensim import corpora, models, similarities
from collections import defaultdict
from six import iteritems
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from spacy.lang.en.stop_words import STOP_WORDS

# Scraping libraries
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from scraping_functions.tumblr_api import get_client
import pytumblr

# Stats libraries
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import datasets, linear_model, metrics
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import svm, datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier


# Other libraries
import geopy

  """)
2018-06-03 17:10:58,721 : INFO : 'pattern' package not found; tag filters are not available for English


## Import the list of hashtags previously scraped

In [36]:
# Load the hashtag list that was scraped and pickled previously.
# NOTE(review): pd.read_pickle unpickles the file, and unpickling can execute
# arbitrary code -- only load pickle files that come from a trusted source.
selfharmmm_related_hashtags_list = pd.read_pickle('iteration1_files/selfharmmm_related_hashtags_list.pkl')

In [37]:
selfharmmm_related_hashtags_list

['#emo',
 '#binge',
 '#razor',
 '#silence',
 '#broken',
 '#emogirl',
 '#anerexic',
 '#sadedits',
 '#depressedboy',
 '#wasteofspace',
 '#suicideedit',
 '#suizidal',
 '#mentalillness',
 '#hatred',
 '#pathetic',
 '#numb',
 '#alone',
 '#suicidial',
 '#triggering',
 '#depressiv',
 '#selfharmmmm',
 '#brokenheart',
 '#cutting',
 '#worthless',
 '#dark',
 '#cry',
 '#lonely',
 '#depressedteen',
 '#depressededits',
 '#ugly',
 '#everythingiswrong',
 '#starve',
 '#sorry',
 '#nobody',
 '#bulimia',
 '#sadness',
 '#monstersinmyhead',
 '#useless',
 '#crying',
 '#suicidalvideo',
 '#secretsociety123',
 '#killme',
 '#death',
 '#depressededit',
 '#suicidaledits',
 '#cutted',
 '#fat',
 '#depressed',
 '#anamia',
 '#ana',
 '#selfinjury',
 '#selfharn',
 '#mia',
 '#dying',
 '#anorexia',
 '#drugs',
 '#imalone',
 '#depressedvideos',
 '#obese',
 '#iwanttodie',
 '#iwanttobeskinny',
 '#empty',
 '#gross',
 '#tears',
 '#pain',
 '#blood',
 '#helpme',
 '#emoboy',
 '#toofat',
 '#depression',
 '#deadinside',
 '#selfhate',

# Define functions to get Tumblr usernames

In [44]:
def get_tumblr_usernames_from_hashtags(list_of_hashtags):
    """
    Collect usernames from the Tumblr search results page of every hashtag.

    For each hashtag a fresh Chrome session is opened, pointed at
    'https://www.tumblr.com/search/<hashtag without its first character>',
    scraped with get_tumblr_usernames, and then closed again.

    Parameters
    ----------
    list_of_hashtags : iterable of str
        Hashtags including their leading character (e.g. '#emo'); the first
        character is dropped when the search URL is built.

    Returns
    -------
    tuple (list, list)
        - the de-duplicated usernames (built from a set, so order is not
          defined);
        - one entry per hashtag, each entry being the safety-page record
          returned by get_tumblr_usernames for that hashtag.
    """
    # Loop-invariant, so set it once instead of once per hashtag.
    os.environ['webdriver.chrome.driver'] = 'chromedriver'

    all_usernames = set()
    all_safety_pages = []

    for hashtag in list_of_hashtags:
        driver = webdriver.Chrome()
        try:
            driver.get('https://www.tumblr.com/search/{}'.format(hashtag[1:]))
            usernames, safety_pages = get_tumblr_usernames(driver, hashtag)
        finally:
            # Always shut the browser down; otherwise every hashtag leaves a
            # Chrome/chromedriver process running, even when scraping fails.
            driver.quit()
        all_usernames.update(usernames)
        # append (not extend) on purpose: keeps the one-entry-per-hashtag
        # nesting that existing callers of this function receive.
        all_safety_pages.append(safety_pages)
    return list(all_usernames), all_safety_pages

In [14]:
def get_tumblr_usernames(driver, hashtag):
    """
    Get the usernames shown on the search results page loaded in `driver`.

    First tries to click through the safety page via pass_safety_page, then
    reads the text of every element with class 'post-info-tumblelog'.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session already navigated to the search results page.
    hashtag : str
        Hashtag being scraped; only used to label the safety-page record.

    Returns
    -------
    tuple (set, list)
        - the set of username strings found on the page;
        - a one-element list [[hashtag, status]] where status is
          'safety passed' or 'safety page missing'.
    """
    try:
        passed = pass_safety_page(driver)
    except Exception:
        # Best effort: any failure while bypassing is recorded as a missing
        # safety page. `Exception` (rather than a bare except) lets
        # KeyboardInterrupt / SystemExit still stop a long scraping run.
        passed = False

    # Generic locator call: the find_elements_by_* helpers were removed in
    # Selenium 4, while find_elements(by, value) exists in old and new
    # releases. 'class name' is the value of By.CLASS_NAME.
    username_elements = driver.find_elements('class name', 'post-info-tumblelog')
    usernames = {element.text for element in username_elements}

    status = 'safety passed' if passed else 'safety page missing'
    return usernames, [[hashtag, status]]

In [15]:
def pass_safety_page(driver):
    """
    Click the first element with class 'footer_link' on the current page.

    The caller treats a successful click as having passed the safety page.
    NOTE(review): presumably 'footer_link' is the link that dismisses
    Tumblr's safety page -- confirm against the live page markup.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session showing the page to check.

    Returns
    -------
    bool
        True if a 'footer_link' element was found and clicked, False if the
        page has no such element.
    """
    # The singular find_element_* call raises when nothing matches instead of
    # returning None, so the "not found" branch could never run. The plural
    # form returns an empty list, which makes the False result reachable.
    # 'class name' is the value of By.CLASS_NAME; find_elements(by, value)
    # works on both old and current Selenium releases.
    bypass_links = driver.find_elements('class name', 'footer_link')
    if not bypass_links:
        return False
    bypass_links[0].click()
    return True

### Test these functions

In [40]:
# Trial run on the first three hashtags only, before scraping the full list.
selfharmmm_usernames_test, selfharmmm_safety_pages_test = get_tumblr_usernames_from_hashtags(selfharmmm_related_hashtags_list[:3])

In [41]:
selfharmmm_usernames_test

['haunted-destruction',
 'blackstarlove123',
 'zarathustra-would-be-proud',
 'veryheartbreaklife',
 'visualness',
 'stillawfullydepressed',
 'depressed-no-one-special',
 'silence-with-no-whispers',
 'tokiobordell',
 'silentsuiciderooms',
 'traintracks-for-wrists',
 'littleoceanwhisper',
 'youdomattertome',
 'dead-girls-never-cry',
 'lukesoutofmylimitt']

In [30]:
selfharmmm_safety_pages_test

[[['#emo', 'safety page missing']],
 [['#binge', 'safety page missing']],
 [['#razor', 'safety passed']]]

# Obtain usernames

In [47]:
# Full run over every hashtag in the list. get_tumblr_usernames_from_hashtags
# starts one Chrome session per hashtag and points the driver setting at
# 'chromedriver' (presumably expected on PATH or in the working directory --
# TODO confirm).
selfharmmm_usernames, selfharmmm_safety_pages = get_tumblr_usernames_from_hashtags(selfharmmm_related_hashtags_list)

In [48]:
selfharmmm_usernames

['spaceeblack',
 'vongriffis',
 'yuckcore',
 'thejabberwock',
 'sticks-and-stones-ribs-and-bones',
 'splendidsam',
 'zareleonis',
 'sixpenceee',
 'sadtastical',
 'artwriting',
 'ediewirt',
 'bugheader',
 'unhave',
 'jonaschristoffersen',
 'progressivejudaism',
 'ryanrosslegs',
 'ash-in-a-trash',
 'absent-ghost',
 'avpdcommunity',
 'trip-with-the-sky',
 'deaddei',
 'eemilysm',
 'wolfpainters',
 'r-ozaay',
 'kpopbopz',
 'fuckoffbitchh-h',
 'sloth-overweight',
 'lewyn-martell',
 'carolulhoa',
 'yesterdaysprint',
 'fuelsthecomedy',
 'greekquotesforlife',
 'lupy22',
 'ripequotes',
 'perfectlygoodforyou',
 'mrsroot',
 'chromarrays',
 'no-hux-given',
 'crazyredhead4678',
 'vaporwavedepression',
 'juansendizon',
 'weighlessflower',
 'rohanok-a-vesztembe',
 'janvranovsky',
 'dyingkami',
 'rainbowhaze420',
 'akii-deathwings',
 'childoflamb',
 'thin-vs-binge',
 'r0ttenpr0duce',
 'm-beks-blog',
 'thegenderisjustalie',
 'were-all-queer-here',
 'tarotprose',
 'kisaragi-aine',
 'marksandrec',
 'cigar

In [49]:
# Persist the scraped usernames. The context manager closes the file handle
# as soon as the dump finishes, so the data is flushed to disk and the handle
# is not left open in the kernel.
with open('iteration1_files/selfharmmm_usernames.pkl', 'wb') as usernames_file:
    pickle.dump(selfharmmm_usernames, usernames_file)

In [53]:
def get_client():
    """
    Build an authenticated pytumblr REST client.

    The four API credentials are read from environment variables so that no
    secret is stored in the notebook:

        TUMBLR_CONSUMER_KEY, TUMBLR_CONSUMER_SECRET,
        TUMBLR_OAUTH_TOKEN, TUMBLR_OAUTH_TOKEN_SECRET

    NOTE(review): the literal keys that used to be hardcoded in this cell have
    been exposed wherever the notebook was shared; they should be revoked and
    regenerated in the Tumblr application settings.
    NOTE(review): this definition shadows the get_client imported from
    scraping_functions.tumblr_api in the imports cell -- keep only one.

    Returns
    -------
    pytumblr.TumblrRestClient
        Client constructed from the four credentials, in the order listed.

    Raises
    ------
    RuntimeError
        If any of the environment variables is unset or empty.
    """
    credential_vars = (
        'TUMBLR_CONSUMER_KEY',
        'TUMBLR_CONSUMER_SECRET',
        'TUMBLR_OAUTH_TOKEN',
        'TUMBLR_OAUTH_TOKEN_SECRET',
    )

    # Fail early with a readable message instead of sending empty credentials.
    missing = [name for name in credential_vars if not os.environ.get(name)]
    if missing:
        raise RuntimeError(
            'Missing Tumblr API credentials; set environment variable(s): '
            + ', '.join(missing)
        )

    client = pytumblr.TumblrRestClient(
        *(os.environ[name] for name in credential_vars)
    )

    return client

In [54]:
# Notebook-level Tumblr client; get_user_posts reads this global directly.
client = get_client()

In [66]:
def get_user_posts(username, max_posts=20):
    """
    Return the post summaries of one blog as [username, summary] pairs.

    Calls client.posts(username) on the notebook-level `client`, flattens the
    response with flatten_json's `flatten`, and reads the keys
    'posts_0_summary', 'posts_1_summary', ... in order.

    Parameters
    ----------
    username : str
        Blog name passed to client.posts.
    max_posts : int, optional
        Maximum number of summaries to read. Defaults to 20, the cap that was
        previously hard-coded (presumably the number of posts one API call
        returns by default -- TODO confirm).

    Returns
    -------
    list of [str, summary]
        One pair per post, in post order. Reading stops at the first index
        whose summary key is absent, so the list may be shorter than
        max_posts (or empty).
    """
    username_json = flatten(client.posts(username))
    all_post_summaries = []
    for idx in range(max_posts):
        summary_key = 'posts_{}_summary'.format(idx)
        # An explicit membership test replaces the bare `except:` that was
        # used as loop control; unrelated errors are no longer hidden.
        if summary_key not in username_json:
            break
        all_post_summaries.append([username, username_json[summary_key]])
    return all_post_summaries

In [72]:
def compile_raw_posts_df(list_of_usernames):
    """
    Build one DataFrame holding the post summaries of all given users.

    Every username is passed to get_user_posts and the resulting
    [username, summary] pairs are stacked in input order.

    Parameters
    ----------
    list_of_usernames : iterable of str
        Blog names to fetch.

    Returns
    -------
    pandas.DataFrame
        Columns 'username' and 'text', one row per post summary.
    """
    rows = [
        post_row
        for name in list_of_usernames
        for post_row in get_user_posts(name)
    ]
    return pd.DataFrame(rows, columns=['username', 'text'])

In [68]:
# Fetch post summaries for every scraped username; compile_raw_posts_df calls
# get_user_posts (one client.posts request) per username.
selfharmmm_raw_text_df = compile_raw_posts_df(selfharmmm_usernames)

In [74]:
selfharmmm_raw_text_df.head(100)

Unnamed: 0,username,text
0,spaceeblack,
1,spaceeblack,
2,spaceeblack,
3,spaceeblack,
4,spaceeblack,
5,spaceeblack,
6,spaceeblack,
7,spaceeblack,
8,spaceeblack,
9,spaceeblack,


In [75]:
# Persist the raw posts dataframe. The context manager closes the file handle
# as soon as the dump finishes, so the data is flushed to disk and the handle
# is not left open in the kernel.
with open('iteration1_files/selfharmmm_raw_text_df.pkl', 'wb') as raw_text_file:
    pickle.dump(selfharmmm_raw_text_df, raw_text_file)