# Import Libraries

In [88]:
# General system libraries
import os
import sys
from IPython.display import Image, Markdown
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Dataframe libraries
import pandas as pd
from pandas import DataFrame, read_csv

# Number manipulation
import scipy.sparse
from scipy.ndimage.filters import generic_filter
import patsy
import numpy as np

# Plotting libaries
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
%matplotlib inline

# Data type libaries
from datetime import datetime as dt

# File manipulation
import pickle
import pandas.io.sql as pd_sql
from sqlalchemy import create_engine
import psycopg2 as pg
from flatten_json import flatten

# NLP libraries
import wikipedia as wiki
from nltk import word_tokenize, sent_tokenize,FreqDist, pos_tag
from nltk.corpus import stopwords
import gensim as gn
from gensim import corpora, models, similarities
from collections import defaultdict
from six import iteritems
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from spacy.lang.en.stop_words import STOP_WORDS
import string
import emoji
import enchant
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer


# Scraping libraries
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from scraping_functions.tumblr_api import get_client
import pytumblr

# Stats libaries
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import datasets, linear_model, metrics
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import svm, datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier


# Other libaries
import geopy

# Define a function to clean the text 

Need to clean the text of the following items:
1. Punctuations and numbers
2. Emojis
3. Other languages
4. Empty strings

In [3]:
punctuation = string.punctuation

In [4]:
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [64]:
test_df = pd.read_pickle('test_df.pkl')

In [87]:
def clean_posts_without_sentences(raw_post_df):
    cleaned_strings = []
    for string in raw_post_df['text']:
        string = string.lower()
        string = replace_emoji_with_text(string)
        for char in string:
            if char in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~':
                string.replace(char, '')
            if char in [0,1,2,3,4,5,6,7,8,9]:
                string.rplace(char, '')
        cleaned_strings.append(string.strip())
    new_df = raw_post_df.copy()
    new_df['cleaned_text'] = cleaned_strings
    new_df.dropna(inplace=True)
    return new_df

In [78]:
clean_posts_without_sentences(test_df)

Unnamed: 0,username,text,cleaned_text
0,greasyquotes,[[MORE]],[[MORE]]
1,greasyquotes,[[MORE]],[[MORE]]
2,greasyquotes,[[MORE]],[[MORE]]
3,greasyquotes,[[MORE]],[[MORE]]
4,greasyquotes,[[MORE]],[[MORE]]
5,greasyquotes,[[MORE]],[[MORE]]
6,greasyquotes,[[MORE]],[[MORE]]
7,greasyquotes,[[MORE]],[[MORE]]
8,greasyquotes,[[MORE]],[[MORE]]
9,greasyquotes,[[MORE]],[[MORE]]


In [86]:
def replace_emoji_with_text(string):
    string_list = list(string)
    for char in string_list:
        if char in emoji.UNICODE_EMOJI.keys():
            string_list.remove(char)
            string_list.append(emoji.UNICODE_EMOJI[char])
    return ''.join(string_list).strip()

In [73]:
# def replace_emoji_with_text(string):
#     for char in string:
#         if char in emoji.UNICODE_EMOJI.keys():
#             string.replace(char, emoji.UNICODE_EMOJI[char])
#     return string.strip()

In [76]:
replace_emoji_with_text('test this string 😕')

'test this string :confused_face:'

In [83]:
def identify_non_english(cleaned_df, max_allowed):
    en = enchant.Dict("en_US")
    language = []
    all_non_english = []
    for string in cleaned_df['cleaned_text']:
        non_english_words = []
        for word in string.split():
            if en.check(word):
                non_english_words.append(word)
        if len(non_english_words) >= max_allowed:
            language.append('not_english')
            all_non_english.append(non_english_words)
        else:
            language.append('english')
            all_non_english.append('all_english')
    new_df = cleaned_df.copy()
    new_df['language'] = language
    new_df['non_english_words'] = all_non_english
    return new_df

In [89]:
def stem_and_lemmatize(cleaned_df):
    stemmer = SnowballStemmer("english", ignore_stopwords=False)
    wnl = WordNetLemmatizer()
    all_stemmed_strings = []
    all_lemmatized_strings = []
    for string in cleaned_df['cleaned_text']:
        lemmatized_string = [wnl.lemmatize(i,j[0].lower()) if j[0].lower() in ['a','n','v'] else wnl.lemmatize(i) for i,j in pos_tag(word_tokenize(txt))]
        words = word_tokenize(string)
        stemmed_string = []
        for word in words:
            word_stemmed = stemmer.stem(word)
            stemmed_string.append(word_stemmed)
        all_stemmed_strings.append(stemmed_string)
        all_lemmatized_strings.append(lemmatized_string)
    new_df = cleaned_df.copy()
    new_df['stemmed_text'] = all_stemmed_strings
    new_df['lemmatized_text'] = all_lemmatized_strings
    return new_df