<a href="https://colab.research.google.com/github/vedanta28/sentiment-analysis/blob/main/hateSpeechClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


Installing Dependencies:

In [None]:
!pip install transformers
!pip install emojis
!pip install --upgrade google-api-python-client
!pip install quica

Importing the required libraries:

In [None]:
import csv
import os
import re
import time

import emoji
import pandas as pd
from googleapiclient import discovery
from quica.quica import Quica
from tqdm import tqdm
from transformers import pipeline
# Perspective API credentials.
# Prefer the PERSPECTIVE_API_KEY environment variable so the key never has to be
# saved inside the notebook; if it is not set, fall back to the placeholder,
# which must then be replaced by hand before running.
API_KEY = os.environ.get('PERSPECTIVE_API_KEY', 'PUT YOUR OWN API KEY HERE')

# Client for the `commentanalyzer` service (v1alpha1), built from the discovery
# document at the URL below rather than from a bundled static one.
client = discovery.build('commentanalyzer',
                         'v1alpha1',
                         developerKey=API_KEY,
                         discoveryServiceUrl='https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1',
                         static_discovery=False)

toxicBERT:

In [None]:
def replace_emojis_with_words(text):
  """Return `text` after passing it through `emoji.demojize`.

  Empty strings are supplied as both delimiters, so no delimiter characters
  are placed around the text substituted for each emoji.
  """
  no_delimiters = ("", "")
  return emoji.demojize(text, delimiters=no_delimiters)

# Hugging Face pipeline wrapping the `unitary/toxic-bert` checkpoint.
model = pipeline(model='unitary/toxic-bert')


def predict(text):
    """Score a single string with `model` and return the first result's score.

    The text is wrapped in a one-element list, so the pipeline output holds
    exactly one entry; that entry's 'score' field is returned.

    NOTE(review): the entry's label is not inspected here - confirm that the
    score of whichever label the pipeline reports is the intended one.
    """
    first_result = model([text])[0]
    return first_result['score']

path = "/content/asian hate crime.csv" #Put the path name of the file here.
filename = "asian hate crime.csv" #Put the name of the file here.
df = pd.read_csv(path)

# Score every tweet with toxicBERT.
# `toxic` holds the raw score and `toxic_binary` the 0/1 label (score >= 0.5).
# Both lists receive exactly one entry per row, in row order, so they always
# line up with `df`; a row whose prediction fails is recorded as NaN instead of
# being skipped (skipping would shift later rows and break the column
# assignment below with a length mismatch).
toxic = list()
toxic_binary = list()
for i, raw_tweet in enumerate(tqdm(df['Tweet'])):
  # Normalise the tweet: URLs -> 'LINK', @mentions -> '@USER',
  # newlines -> spaces, emojis -> words.
  text = re.sub(r'https?:\/\/\S*', 'LINK', str(raw_tweet))
  text = re.sub(r'\B@\w+', '@USER', text)
  text = text.replace('\n', ' ')
  text = replace_emojis_with_words(text)

  try:
    toxic_score = predict(text)
  except Exception as exc:
    # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still stops the run.
    print('Error at : ', i, exc)
    toxic.append(float('nan'))
    toxic_binary.append(float('nan'))
    continue

  toxic.append(toxic_score)
  toxic_binary.append(1 if toxic_score >= 0.5 else 0)

df['toxic_bert'] = toxic
df['toxic_bert_binary'] = toxic_binary

Google Perspective API:

In [None]:
# Score every tweet with the Perspective API (TOXICITY attribute).
# Rows start at 0.0 / 0 and keep those values when the request fails, so a
# score of exactly 0.0 marks "no score obtained".
# The score column is created as float (0.0, not 0) because it receives
# fractional scores; writing a float into an integer column via `.iat` is
# lossy or rejected depending on the pandas version.
df['perspective'] = 0.0
index = list(df.columns).index('perspective')
df['perspective_binary'] = 0
index_binary = list(df.columns).index('perspective_binary')

for i in tqdm(range(df.shape[0])):
    # Normalise the tweet: URLs -> 'LINK', @mentions -> '@USER', newlines -> spaces.
    text = re.sub(r'https?:\/\/\S*', 'LINK', str(df.iloc[i].Tweet))
    text = re.sub(r'\B@\w+', '@USER', text)
    text = text.replace('\n', ' ')
    analyze_request = {'comment': { 'text': text},
                       'requestedAttributes': {'TOXICITY': {}},
                       'doNotStore': True}

    try:
        response = client.comments().analyze(body=analyze_request).execute()
        score = response['attributeScores']['TOXICITY']['summaryScore']['value']
    except Exception as exc:
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still stops the run.
        print('Error at : ', i, exc)
    else:
        df.iat[i, index] = score
        if score >= 0.5:
            df.iat[i, index_binary] = 1

    # Pause between requests - presumably to stay within the API's rate limit.
    time.sleep(1)

df.to_csv(filename)

Calculating Krippendorff's Alpha and finding the disagreements:

In [None]:
path = "/content/feminism.csv" #Put the path name of the file here.
df = pd.read_csv(path)

# Keep only the tweets both models actually labelled:
#  - a `perspective` score of exactly 0.0 means the Perspective API call did
#    not produce a score for that tweet, so the row is discarded;
#  - a missing `toxic_bert_binary` cannot be compared, so it is discarded too.
scored = df[(df['perspective'] != 0.0) & df['toxic_bert_binary'].notna()]
toxic_bert_binary = scored['toxic_bert_binary'].tolist()
perspective_binary = scored['perspective_binary'].tolist()

# Tweets on which the two models disagree. Column position 1 is exported under
# the 'Unique ID' header below.
conflicting = scored[scored['toxic_bert_binary'] != scored['perspective_binary']]
disaggrements = [
    list(record)
    for record in zip(
        conflicting.iloc[:, 1],
        conflicting['User'],
        conflicting['Tweet'],
        conflicting['toxic_bert_binary'],
        conflicting['perspective_binary'],
    )
]

# Agreement statistics between the two binary labelings.
df_aggrement = pd.DataFrame({"toxic_bert_binary" : toxic_bert_binary, "perspective_binary" : perspective_binary})
quica = Quica(dataframe = df_aggrement)
print(quica.get_results())

Details = ['Unique ID', 'User', 'Tweet', 'toxic_bert_binary', 'perspective_binary']
# newline='' is required by the csv module: without it, embedded newlines in
# tweets are mishandled and Windows output gets a blank line after every row.
with open('disaggrements.csv', 'w', encoding='UTF8', newline='') as f:
    write = csv.writer(f)
    write.writerow(Details)
    write.writerows(disaggrements)