# Overview

In [1]:
from typing import List, Optional
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import *
import string
import re

In [4]:
df_utters = pd.read_parquet('data/samp_utterances.parquet')
df_utters.head()


Unnamed: 0,conv_id,timestamp,source,text
0,0cd998517ae0f3501e4b9af197d047877116663e,1602761629000,expert,Article URL: https://www.bbc.co.uk/news/uk-wal...
1,0cd998517ae0f3501e4b9af197d047877116663e,1602761629000,expert,Title: Snowdonia visitor pass scheme 'to curb ...
2,0cd998517ae0f3501e4b9af197d047877116663e,1602761629000,client,There are plans for Snowdonia visitors to use ...
3,0cd998517ae0f3501e4b9af197d047877116663e,1602761629000,client,It would see visitors use larger park and ride...
4,0cd998517ae0f3501e4b9af197d047877116663e,1602761629000,client,Meanwhile parking would be largely removed in ...


# Vectorize utterances

## Functions

In [2]:
# Davidson's custom tokenizer
stemmer = PorterStemmer()
def tokenize(tweet):
    """Removes punctuation & excess whitespace, sets to lowercase,
    and stems tweets. Returns a list of stemmed tokens."""
    tweet = " ".join(re.split("[^a-zA-Z]*", tweet.lower())).strip()
    #tokens = re.split("[^a-zA-Z]*", tweet.lower())
    tokens = [stemmer.stem(t) for t in tweet.split()]
    return tokens

# Function to gather and tally hateful n-grams found in a given article and calculate the "hatescore" for that article
def tally_counts_doc(row):
    row2 = row[row > 0]
    score = 0
    hits = {}
    for index,val in row2.items():
        hits[index]=val
        hit = dict_hateweights[index] * val
        score += hit 
    row['hate_score'] = score
    row['hate_hits'] = hits

    return row

In [1]:
df_utters.head()


NameError: name 'df_utters' is not defined

## Vectorization

In [3]:
# __________________Load DF of Hateful terms
df_hate_words = pd.read_csv('data/raw_other/hateful_ngrams.csv')
df_hate_words.set_index('ngram',drop=True,inplace=True)
dict_hateweights = df_hate_words['prophate'].to_dict()
hate_list = list(df_hate_words.index)

#____________________ Load text to vectorize

cols = ['conv_id',  'timestamp', 'source', 'text']
df_utters = pd.read_parquet('out/combined_utterances.parquet',names=cols)
#df_utters = df_utters[df_utters['source'] == 'client']

df_utters = df_utters.head(12000) #-------------------------------abbrveiate sample
df_utters.reset_index(inplace=True,drop=True)

#____________________instantiate and fit the vectorizer



unlab_vectors = unlab_cvect.fit(df_utters['text'])
unlab_matrix = unlab_vectors.transform(df_utters['text'])
df_vectors = pd.DataFrame(unlab_matrix.todense(),columns=unlab_vectors.get_feature_names())





## Tally hate score

In [5]:
#______________________________________Gather hate counts and calculate hate scores

df_vectors = df_vectors.apply(lambda row:tally_counts_doc(row),axis=1,result_type='expand')

df_hatecounts = pd.concat([df_utters,df_vectors],axis=1)

df_hatecounts['hate_hits'] = df_hatecounts['hate_hits'].astype('string')



In [6]:
df_hatecounts.head()

Unnamed: 0,conv_id,timestamp,source,text,allah akbar,blacks,chink,chinks,dykes,faggot,...,is full of white,lame nigga you a,many niggers are in,nigga you a lame,niggers are in my,wit a lame nigga,you a lame bitch,you fuck wit a,hate_score,hate_hits
0,c59245766b9e0616c3e5040be3822e0539cca091,1610019564000,client,Want to get The Morning by email? Here’s the s...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,{}
1,c59245766b9e0616c3e5040be3822e0539cca091,1610019564000,client,Donald Trump has been attacking American democ...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,{}
2,c59245766b9e0616c3e5040be3822e0539cca091,1610019564000,client,"He has told repeated lies about voter fraud, u...",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,{}
3,c59245766b9e0616c3e5040be3822e0539cca091,1610019564000,client,"Yesterday, hundreds of those supporters decide...",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,{}
4,c59245766b9e0616c3e5040be3822e0539cca091,1610019564000,client,"They fought their way through armed police, sm...",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,{}


In [7]:
df_hatecounts = df_hatecounts[df_hatecounts['hate_score'] > 0]

In [8]:
df_hate_counts = df_hatecounts.sort_values('hate_score',ascending = False)

In [9]:
df_hate_counts

Unnamed: 0,conv_id,timestamp,source,text,allah akbar,blacks,chink,chinks,dykes,faggot,...,is full of white,lame nigga you a,many niggers are in,nigga you a lame,niggers are in my,wit a lame nigga,you a lame bitch,you fuck wit a,hate_score,hate_hits
6747,4df1a2a70d598d520dc0b1dc4704e051f8661166,1610811010000,client,"“This work is for me an old work, but it has a...",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75,{'all white': 1}
6629,a7a86275c5f549a5d143b1a520cfdd268d490433,1610011223173,client,"Remember Pizzagate, when a North Carolina man ...",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.667,{'slave': 1}
1697,1256e5f84f9d83946d9470ab121de49549deffef,1609983874000,client,Kuenssberg says 'nitty-gritty' on Brexitcast d...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.667,{'slave': 1}
4178,1256e5f84f9d83946d9470ab121de49549deffef,1609983874000,client,Kuenssberg says 'nitty-gritty' on Brexitcast d...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.667,{'slave': 1}
1922,9e30128ea3a8329a8418214b87e7a98ad49175da,1610015617000,client,The rioting at the Capitol left a woman dead. ...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.642,{'the female': 1}
4403,9e30128ea3a8329a8418214b87e7a98ad49175da,1610015617000,client,The rioting at the Capitol left a woman dead. ...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.642,{'the female': 1}
11713,10e2287443f679b94bc44df2fa268aab444eda46,1609987579000,client,Republicans who needed to see the Capitol seiz...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,{'their heads': 1}
1358,b3b68b8df842ff434cea1b30f394a9fb6bc76c4a,1610938800000,client,"Then, once you're finished playing, the whole ...",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,{'their heads': 1}
9781,2b5ca486b92c19018211afea99242f5edaf4a4e8,1609992832000,client,"""It's really sad and in a lot of respects emba...",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,{'their heads': 1}
8945,9512b4143e6b7145dd79cadfdad3a6420d3e3db3,1609978933000,client,The chamber filled with frantic shouting and t...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,{'their heads': 1}


In [11]:
df_counts = df_vectors.sum(axis=0)
df_counts.sort_values(inplace=True,ascending = False)
df_counts = df_counts[df_counts > 0]
df_counts

hate_score     26.163
of white       12.000
blame the       8.000
blacks          6.000
married to      6.000
their heads     5.000
slave           3.000
whites          2.000
the female      2.000
all white       1.000
dtype: float64

# Spark vectorization

In [2]:
import findspark


import pyspark
import random
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, FloatType, StringType, IntegerType
import pyspark.sql.functions as F
from pyspark.ml.feature import NGram
findspark.init()

sc.stop()
sc = SparkContext('local')
spark = SparkSession(sc)

cols = ['conv_id',  'timestamp', 'source', 'text']
df_utters = spark.read.csv('out/article_data_df_1_1.ftr.csv')
df_utters = df_utters.selectExpr("_c0 as conv_id","_c1 as timestamp","_c2 as source","_c3 as text")

df_utters = df_utters.limit(5000)

#___________________________  Tokenize

stemmer = PorterStemmer()
def tokenize(tweet):
    """Removes punctuation & excess whitespace, sets to lowercase,
    and stems tweets. Returns a list of stemmed tokens."""
    #tweet = " ".join(re.split("[^a-zA-Z]*", tweet.lower())).strip()
    tokens = re.split("[^a-zA-Z]*", tweet.lower().strip())
    tokens = [stemmer.stem(t) for t in tweet.split()]
    return tokens

token_udf = udf(tokenize,ArrayType(StringType()))
#df_utters  = df_utters.withColumn("tokens", token_udf(F.col('text')))

from pyspark.ml.feature import Tokenizer

tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
df_utters = tokenizer.transform(df_utters)
#____________________________ n-grams


for i in range(1,5):
    
    ngram = NGram(n=i,inputCol="tokens",outputCol='n' + str(i))

    df_utters = ngram.transform(df_utters)
    
add_lists = F.udf(lambda a,b,c,d : a + b + c + d,ArrayType(StringType()))
df_utters = df_utters.withColumn('n_all',add_lists('n1','n2','n3','n4'))


ModuleNotFoundError: No module named 'pyspark'

In [None]:
from pyspark.sql.types import ArrayType, FloatType, StringType, IntegerType

col_len = F.udf(lambda a: len(a),IntegerType())

df_utters = df_utters.withColumn('token_len',col_len('tokens'))
df_utters = df_utters.withColumn('n_all_len',col_len('n_all'))

for i in range(2,5):
    df_utters = df_utters.withColumn('n'+ str(i) + '_len',col_len('n'+ str(i)))

In [None]:
df_utters.where(df_utters['n_all_len'] <= 14).select('n_all_len').show()

In [None]:
from pyspark.ml.feature import CountVectorizer

# __________________Load DF of Hateful terms
df_hate_words = pd.read_csv('data/raw_other/hateful_ngrams.csv')
df_hate_words.set_index('ngram',drop=True,inplace=True)
dict_hateweights = df_hate_words['prophate'].to_dict()
hate_list = list(df_hate_words.index)

df_utters = df_utters.na.drop("all")

cv = CountVectorizer(inputCol="n_all", outputCol="features")

cvec_model = cv.fit(df_utters)
cvec_model.transform(df_utters).show()


In [3]:
import dask.dataframe as dd


# Dask?

In [9]:
cols = ['conv_id',  'timestamp', 'source', 'text']
df_utters = dd.read_csv('out/article_data_df_1_1.ftr.csv',names=cols)

In [10]:
df_utters

Unnamed: 0_level_0,conv_id,timestamp,source,text
npartitions=2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,object,int64,object,object
,...,...,...,...
,...,...,...,...
