# Overview

In [1]:
from typing import List, Optional
import feather
import pandas as pd
from glob import glob
from dateutil.parser import isoparse
from hashlib import sha1
from bs4 import BeautifulSoup
import bs4
from tqdm import tqdm

from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem.porter import *
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
import string
import re
from nltk.stem.porter import *
import sqlite3

[nltk_data] Downloading package punkt to /home/umar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Vectorize utterances

## Functions

In [None]:
# Davidson's custom tokenizer
# Shared Porter stemmer instance; tokenize() below calls its stem() method on every word.
stemmer = PorterStemmer()
def tokenize(tweet, stem_fn=None):
    """Lowercase a tweet, keep only its alphabetic words, and stem each one.

    Args:
        tweet: Text to tokenize. ``None`` yields an empty list.
        stem_fn: Optional callable mapping one word to its stem. Defaults to
            the ``stem`` method of the module-level ``stemmer``.

    Returns:
        A list of stemmed, lowercase tokens in their original order.
    """
    if tweet is None:
        return []
    if stem_fn is None:
        stem_fn = stemmer.stem
    # Match runs of letters directly. The previous pattern "[^a-zA-Z]*" can
    # match the empty string, and since Python 3.7 re.split() also splits on
    # empty matches, which broke every word into single characters.
    words = re.findall("[a-zA-Z]+", tweet.lower())
    return [stem_fn(word) for word in words]

# Function to gather and tally hateful n-grams found in a given article and calculate the "hatescore" for that article
def tally_counts_doc(row, weights=None):
    """Score one row of n-gram counts.

    Args:
        row: pandas Series whose index holds n-grams and whose values are the
            counts of each n-gram in one document.
        weights: Optional mapping from n-gram to its weight. Defaults to the
            module-level ``dict_hateweights``.

    Returns:
        A copy of ``row`` with two extra entries: ``hate_score`` (the sum of
        count * weight over every n-gram with a positive count) and
        ``hate_hits`` (a dict of those n-grams and their counts).

    Raises:
        KeyError: if an n-gram with a positive count has no weight.
    """
    if weights is None:
        weights = dict_hateweights

    # Only n-grams that actually occur contribute to the score.
    present = row[row > 0]
    hits = dict(present.items())
    score = sum(weights[ngram] * count for ngram, count in hits.items())

    # Work on a copy: pandas does not support apply() callbacks that mutate
    # the row object they are handed.
    scored = row.copy()
    scored['hate_score'] = score
    scored['hate_hits'] = hits
    return scored

## Vectorization

In [None]:
# __________________Load DF of Hateful terms
# The CSV's 'ngram' column becomes the index; 'prophate' supplies the
# per-n-gram weight that tally_counts_doc multiplies the counts by.
df_hate_words = pd.read_csv('data/raw_other/hateful_ngrams.csv').set_index('ngram', drop=True)
dict_hateweights = df_hate_words['prophate'].to_dict()
hate_list = list(df_hate_words.index)

#____________________ Load text to vectorize

cols = ['conv_id',  'timestamp', 'source', 'text']
df_utters = pd.read_csv('out/article_data_df_1_1.ftr.csv',names=cols)
df_utters = df_utters[df_utters['source'] == 'client']

# Abbreviate the sample to the first 24000 client utterances and renumber the
# rows from 0 so they line up with the rows of the count matrix built below.
df_utters = df_utters.head(24000).reset_index(drop=True)

#____________________instantiate and fit the vectorizer
# The vocabulary is fixed to the hateful n-grams, so only those are counted.
unlab_cvect = CountVectorizer(
    ngram_range=(1,4),
    vocabulary=hate_list
)

unlab_vectors = unlab_cvect.fit(df_utters['text'])
unlab_matrix = unlab_vectors.transform(df_utters['text'])

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(unlab_vectors, 'get_feature_names_out'):
    feature_names = unlab_vectors.get_feature_names_out()
else:
    feature_names = unlab_vectors.get_feature_names()

# toarray() yields a plain ndarray rather than the legacy np.matrix from todense().
df_vectors = pd.DataFrame(unlab_matrix.toarray(), columns=feature_names)





In [None]:
# Preview the first rows of the utterance frame.
df_utters.head()

## Tally hate score

In [None]:
#______________________________________Gather hate counts and calculate hate scores
# Register progress_apply on pandas objects so the row-wise scoring shows a progress bar.
tqdm.pandas(desc="counting hate score")

# Score every row of counts; each returned row keeps its n-gram counts and
# gains the 'hate_score' and 'hate_hits' entries set by tally_counts_doc.
df_vectors = df_vectors.progress_apply(
    tally_counts_doc,
    axis=1,
    result_type='expand',
)

# Put the scored counts side by side with the utterance metadata.
df_hatecounts = pd.concat((df_utters, df_vectors), axis=1)

# Keep the per-utterance hit dictionaries as text.
df_hatecounts = df_hatecounts.assign(
    hate_hits=df_hatecounts['hate_hits'].astype('string')
)



In [None]:
# Preview the combined utterance / hate-score frame.
df_hatecounts.head()

In [None]:
# Keep only the utterances whose hate score is positive.
df_hatecounts = df_hatecounts.loc[df_hatecounts['hate_score'] > 0]

In [None]:
# Rank the remaining utterances from highest to lowest hate score.
df_hate_counts = df_hatecounts.sort_values(by='hate_score', ascending=False)

In [None]:
# Display the utterances ranked by hate score.
# NOTE(review): this renders the whole frame; consider .head(n) to keep the output bounded.
df_hate_counts

In [None]:
# Total occurrences of each n-gram across all utterances, plotted as a bar chart.
# After scoring, df_vectors also carries 'hate_score' and the dict-valued
# 'hate_hits'; summing those either fails or shows the score as if it were an
# n-gram, so restrict the totals to the n-gram count columns.
ngram_counts = df_vectors.drop(columns=['hate_score', 'hate_hits'], errors='ignore')
df_counts = ngram_counts.sum(axis=0).sort_values(ascending=False)
df_counts = df_counts[df_counts > 0]
df_counts.plot(kind='barh', figsize=(10,15), title='Occurrences per hateful n-gram')

# Spark vectorization

In [None]:
import findspark

# findspark.init() puts the Spark installation on sys.path, so it has to run
# before the first pyspark import rather than after it.
findspark.init()

import pyspark
import random
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, FloatType, StringType, IntegerType
import pyspark.sql.functions as F
from pyspark.ml.feature import NGram

# Stop a context left over from an earlier run of this cell. On a fresh kernel
# `sc` does not exist yet, and an unconditional sc.stop() raises NameError.
if 'sc' in globals():
    sc.stop()
sc = SparkContext('local')
spark = SparkSession(sc)

# The CSV is read without a header, so Spark names the columns _c0.._c3;
# rename them to the names in `cols`.
# NOTE(review): from here on df_utters is a Spark DataFrame and no longer the
# pandas frame of the same name used in the earlier cells.
cols = ['conv_id',  'timestamp', 'source', 'text']
df_utters = spark.read.csv('out/article_data_df_1_1.ftr.csv')
df_utters = df_utters.selectExpr(
    *["_c{} as {}".format(position, name) for position, name in enumerate(cols)]
)

# Work on at most 5000 rows.
df_utters = df_utters.limit(5000)

#___________________________  Tokenize

stemmer = PorterStemmer()
def tokenize(tweet, stem_fn=None):
    """Lowercase a tweet, keep only its alphabetic words, and stem each one.

    Args:
        tweet: Text to tokenize. ``None`` (a null value handed to the UDF)
            yields an empty list.
        stem_fn: Optional callable mapping one word to its stem. Defaults to
            the ``stem`` method of the module-level ``stemmer``.

    Returns:
        A list of stemmed, lowercase tokens in their original order.
    """
    if tweet is None:
        return []
    if stem_fn is None:
        stem_fn = stemmer.stem
    # The previous body computed a regex split and then discarded it, stemming
    # tweet.split() on the raw text instead, so case and punctuation were never
    # normalised. Its pattern "[^a-zA-Z]*" also matches the empty string, which
    # makes re.split() cut between every character on Python 3.7+.
    words = re.findall("[a-zA-Z]+", tweet.lower())
    return [stem_fn(word) for word in words]

# UDF wrapper around the custom tokenize(). It is defined but not applied:
# the Spark ML Tokenizer below produces the 'tokens' column instead.
token_udf = udf(tokenize, returnType=ArrayType(StringType()))
#df_utters  = df_utters.withColumn("tokens", token_udf(F.col('text')))

from pyspark.ml.feature import Tokenizer

tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
df_utters = tokenizer.transform(df_utters)

#____________________________ n-grams
# Add one array column per n-gram size: 'n1', 'n2', 'n3' and 'n4'.
for size in (1, 2, 3, 4):
    ngram = NGram(n=size, inputCol="tokens", outputCol='n{}'.format(size))
    df_utters = ngram.transform(df_utters)


def _concat_ngrams(unigrams, bigrams, trigrams, fourgrams):
    """Join the four per-size n-gram lists into one list."""
    return unigrams + bigrams + trigrams + fourgrams


# Single column holding every 1- to 4-gram of the utterance.
add_lists = F.udf(_concat_ngrams, ArrayType(StringType()))
df_utters = df_utters.withColumn('n_all', add_lists('n1', 'n2', 'n3', 'n4'))


In [None]:
from pyspark.sql.types import ArrayType, FloatType, StringType, IntegerType

# UDF returning the number of elements in an array column.
col_len = F.udf(lambda items: len(items), IntegerType())

# Lengths of the token list and of the combined n-gram list.
for source_col, length_col in (('tokens', 'token_len'), ('n_all', 'n_all_len')):
    df_utters = df_utters.withColumn(length_col, col_len(source_col))

# Lengths of the 2-, 3- and 4-gram lists: 'n2_len', 'n3_len', 'n4_len'.
for size in (2, 3, 4):
    ngram_col = 'n{}'.format(size)
    df_utters = df_utters.withColumn(ngram_col + '_len', col_len(ngram_col))

In [None]:
# Show n_all_len for the rows whose combined n-gram list has at most 14 entries.
# NOTE(review): the threshold 14 is not explained here; confirm what it is meant to isolate.
df_utters.where(df_utters['n_all_len'] <= 14).select('n_all_len').show()

In [None]:
# NOTE(review): this import rebinds the name CountVectorizer, which the first
# cell imported from sklearn.feature_extraction.text; re-running the earlier
# scikit-learn cell after this one would pick up the Spark class instead.
from pyspark.ml.feature import CountVectorizer

# __________________Load DF of Hateful terms
df_hate_words = pd.read_csv('data/raw_other/hateful_ngrams.csv').set_index('ngram', drop=True)
dict_hateweights = df_hate_words['prophate'].to_dict()
hate_list = list(df_hate_words.index)

# Drop the rows in which every column is null.
df_utters = df_utters.dropna(how="all")

# Fit a count vectorizer on the combined n-gram column and show the result.
# NOTE(review): hate_list is loaded above but never handed to the vectorizer,
# so the vocabulary is learned from the data; confirm whether it was meant to
# be restricted to the hateful n-grams.
cv = CountVectorizer(inputCol="n_all", outputCol="features")
cvec_model = cv.fit(df_utters)
df_vectorized = cvec_model.transform(df_utters)
df_vectorized.show()
