<a href="https://colab.research.google.com/github/victorknox/Hate-Speech-Detection-in-Hindi/blob/main/Hate_Speech_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Subjectivity Analysis
- We use Sentiment lexicon resource for hindi called Hindi Sentiwordnet.
- It has around 3000 prior-polarity subjective clues with POS tag, positive score, negative score and related terms(separated by comma)

In [16]:
SUBJCLUE = []                     # SUBJCLUE Data

with open('SUBJCLUE.txt') as f:   # Formatting SUBJCLUE Data
    for line in f:                # iterate over the lines of the file
      x = line.split()            # split the line into a list of words
      x[4] = x[4].split(',')      # split the list into a list of words
      SUBJCLUE.append(x)          # append the list to the list of lists

# After this, the data would be in this form:
# ['POS tag', 'SYSNET ID(Hindi WN)', 'Positive score', 'Negative score', List of related words]

# printing the first 5 rows
for key in SUBJCLUE[:5]:
  print(key[4])


['अनौपचारिक']
['मृत']
['परवर्ती']
['अच्छा', 'बढ़िया']
['सौभाग्यशाली', 'खुशकिस्मत', 'खुशनसीब', 'तक़दीर_वाला', 'नसीब_वाला', 'भाग्यवान', 'भाग्यशाली', 'ख़ुशक़िस्मत', 'ख़ुशनसीब']


## Reading the data


Note: The Dataset should be a csv file with Fields corresponding to Unique ID, Post, Labels Set 

In [17]:
import csv                                  # importing csv module
# csv file name

filename = "Dataset/valid.csv"                      # change this file name to whatever you want
  

fields = []                                 # defining fields
rows = []                                   # defining rows
  
with open(filename, 'r') as csvfile:        # opening csv file
    # creating a csv reader object
    csvreader = csv.reader(csvfile)
      
    # extracting field names through first row
    fields = next(csvreader)
  
    # extracting each data row one by one
    for row in csvreader:
        rows.append(row)
  
    # get total number of rows
    print("Total no. of rows: %d"%(csvreader.line_num))
  
# printing the field names
print('Field names are:' + ', '.join(field for field in fields))
  
#  printing first 5 rows
# Appending a score for each row
tot = 0
for row in rows:
    row.append(tot)
    # print(row)
# can be accessed using row[3]
for row in rows[:5]:
  print(row)

Total no. of rows: 2259
Field names are:Unique ID, Post, Labels Set
['1', 'दृढ़ इच्छा शक्ति से परिपूर्ण प्रणबदा के लिए देशहित सर्वोच्च रहा।\n\nउनका निधन हम सब के लिए अपूरणीय क्षति है।\nईश्वर दिवंगत आत्मा को अपने श्रीचरणों में स्थान दें। शोक संतप्त परिजनों के प्रति संवेदनाएं।\nऊं शांति!!!', 'non-hostile', 0]
['2', 'भारतीय जनता पार्टी rss वाले इतने गिरे हुए हैं जहां मैं रहती हूं वहां मेरी जासूसी  करा रहें है उसकी जासूस की पहचान मुझे अच्छी तरह है rss बीजेपी वाले की जासूस दिल्ली में कौन है उत्तर प्रदेश में कौन है हरियाणा राजस्थान में कौन है सबकी पहचान है मुझे मेरी नजर से बच नहीं सकते हो', 'defamation', 0]
['3', 'कोरोना से निपटने की तैयारी / दिल्ली में 10 हजार बेड वाला दुनिया का सबसे बड़ा कोविड केयर सेंटर शुरू, राजनाथ-शाह ने डीआरडीओ के 1 हजार बेड वाले सेंटर का भी उद्घाटन किया\nhttps://t.co/9rlQowAsFh #Delhi @ArvindKejriwal  @rajnathsingh @AmitShah @DRDO_India @WHO @crpfindia @ITBP_official', 'non-hostile', 0]
['4', 'गवर्नर कॉन्फ्रेंस में PM मोदी बोले- शिक्षा नीति में सरकार का दखल कम होना चा

## Checking score

Finding positive, negative and total scores for each sentence



In [18]:
count = 0                             # initialize count
for key in SUBJCLUE:                  # for each word in SUBJCLUE
  subjlist = key[4]                   # get the list of subjects         
# subjlist = ['इच्छा', 'आत्मा', 'इतने']
  for row in rows:                    # for each row in the csv file
    if any([subjword in row[1] for subjword in subjlist]):  # if any of the words in the list are in the row's text
      count += 1            # increment count
      pos = float(key[2])   # get the pos value
      neg = float(key[3])   # get the neg value
      tot = pos - neg       # calculate the total
      row[3] += tot         # add the total to the row's total

# printing the number of occurences of sentiment words in dataset
print(count)

10530


# Hate Lexicon Growing


In [19]:
# Installing required modules
!pip install stanza
!pip install setuptools
!pip install subzero
!pip install inltk



In [20]:
SYNSET = []                                                     # SYNSET is a list of lists
with open('Synset.txt', encoding= 'unicode_escape') as f:       # opening synset.txt file
    for line in f:                                              # iterating through SYNSET        
        x = line.split()                                        # splitting lines
        x[3] = x[3].split(':')                                  # [3] is the synonyms
        SYNSET.append(x)                                        # append to SYNSET

import stanza                                                   # stanza is a library for natural language processing
stanza.download('hi', processors='tokenize,pos,lemma')          # download the stanza library for Hindi NLP

import csv                                                      # csv is a library for reading and writing csv files
dataset = ""                                                    # dataset is a string

for row in rows:                                                # iterating through rows
    dataset+=row[1]                                             # appending to dataset

verbs_content = []                                              # verbs_content is a list of lists
nlp = stanza.Pipeline('hi',processors='tokenize,pos,lemma')     # nlp is a pipeline for processing text
# pos = open('hindi_pos.txt','w')                                 # opening hindi_pos.txt in write mode
doc = nlp(dataset)                                              # doc is a document object
for sentence in doc.sentences:                                  # iterating through sentences
     for word in sentence.words:                                # iterating through words
         if word.upos == 'VERB':                                # if word is a verb
             verbs_content.append(word.text)                    # append to verbs_content

strongly_negative_words = []                                    # strongly_negative_words is a list
weakly_negative_words = []                                      # weakly_negative_words is a list
for line in SUBJCLUE:                                           # iterating through SUBJCLUE
    totalscore = float(line[2]) - float(line[3])                # calculating total score
    if(totalscore < -0.25):                                     # if total score is less than -0.35
      for word in line[4]:                                      # iterating through words in line[4]
        strongly_negative_words.append(word)                    # append to strongly_negative_words
    elif totalscore < 0:                                        # if total score is less than 0
      for word in line[4]:                                      # iterating through words in line[4]
        weakly_negative_words.append(word)                      # append to weakly_negative_words
        
def Getsynset(word):                                            # Getsynset is a function
    syn = []                                                    # syn is a list
    flag=0                                                      # flag is a variable
    syn.append(word)                                            # appending word to syn
    for line in SYNSET:                                         # iterating through SYNSET
        if(line[1]=="03"):                                      # if line[1] is equal to 03
            for verb in line[3]:                                # iterating through verbs in line[3]
                if(word == verb):                               # if word is equal to verb
                    flag = 1                                    # flag is set to 1
                    break                                       # break
            if(flag):                                           # if flag is set to 1
                syn = line[3]                                   # syn is set to line[3]
                break                                           # break
    return syn                                                  # return syn

s = {}                                                          # s is a dictionary
hlex = []                                                       # hlex is a list

slist = ["लड़ना" , "मारना" , "लूटना" , "पीटना" , "कूटना" , "भेदभाव" ,"फोड़ना", "तोड़ना", "उखाड़ना" ]    # slist is a list of verbs
for word in slist:                                                                              # iterating through slist
  hlex.append(word)                                                                             # appending to hlex
for word in slist:                                                                              # iterating through slist
    s = Getsynset(word)                                                                         # s is set to Getsynset
    for verb1 in s:                                                                             # iterating through s
        if verb1 in verbs_content:                                                              # if verb1 is in verbs_content
            hlex.append(verb1)                                                                  # appending to hlex


HBox(children=(FloatProgress(value=0.0, description='Downloading https://raw.githubusercontent.com/stanfordnlp…

2021-07-29 18:03:09 INFO: Downloading these customized packages for language: hi (Hindi)...
| Processor | Package |
-----------------------
| tokenize  | hdtb    |
| pos       | hdtb    |
| lemma     | hdtb    |
| pretrain  | hdtb    |

2021-07-29 18:03:09 INFO: File exists: /root/stanza_resources/hi/tokenize/hdtb.pt.
2021-07-29 18:03:09 INFO: File exists: /root/stanza_resources/hi/pos/hdtb.pt.
2021-07-29 18:03:09 INFO: File exists: /root/stanza_resources/hi/lemma/hdtb.pt.





2021-07-29 18:03:10 INFO: File exists: /root/stanza_resources/hi/pretrain/hdtb.pt.
2021-07-29 18:03:10 INFO: Finished downloading models and saved to /root/stanza_resources.
2021-07-29 18:03:10 INFO: Loading these models for language: hi (Hindi):
| Processor | Package |
-----------------------
| tokenize  | hdtb    |
| pos       | hdtb    |
| lemma     | hdtb    |

2021-07-29 18:03:10 INFO: Use device: cpu
2021-07-29 18:03:10 INFO: Loading: tokenize
2021-07-29 18:03:10 INFO: Loading: pos
2021-07-29 18:03:10 INFO: Loading: lemma
2021-07-29 18:03:10 INFO: Done loading processors!


In [21]:
# open themenouns.txt in read
themed_nouns = open('themenouns.txt','r')
themenouns = []                                 # list of theme nouns
for line in themed_nouns:                       # read the file line by line
    themenouns.append(line.rstrip('\n'))        # append the theme nouns to the list
print(themenouns)                               # printing the theme nouns list

['बीजेपी ', 'मोदी ', 'माओवादियों ', 'इस्लाम ', 'धमकी ', 'सुरक्षा ', 'धर्म ', 'साले ', 'कुत्ते ', 'कुतिया', 'कुते ', 'कुत्ती', 'कुत्तो', 'कमीना', 'कमीनी', 'साला', 'साली', 'हरामी', 'हरामखोर', 'बहनचोद', 'मादरचोद', 'चूतिया', 'चूत', 'चुत', 'टट्टी', 'नाजायज', 'झांट', 'सुअर', 'बेटीचोद', 'गांड', 'भोसड़ी', 'रन्डी', 'रांड', 'भड़वे', 'लौड़ा', 'लोडे', 'लवड़ा', 'चोर ', 'औलाद ', 'चीन ', 'औकात ', 'चुनौती', 'कश्मीर ', 'ज़ुल्म ', 'मरकज ', 'भारत', 'आतंकवाद', 'इस्लामिक', 'तालिबानी', 'हिन्दू ', 'अर्नब ', 'गद्दारों ', 'कलंकित ', 'तोड़फोड़ ', 'शिवसेना ', 'मंदिर ', 'राम ', 'हिन्दुओं ', 'शूद्र ', 'मुसलमान ', 'विपक्षी ', 'आग ', 'कॉंग्रेस ', 'आतंकवादी ', 'डायन ', 'पलटू ', 'फेंकूँ ', 'पाकिस्तान ', 'जिंदाबाद ', 'आतंकी ', 'आतंकी ', 'आतंकियों ', 'हिंदुस्तान ', 'हिन्दुओं', 'नेता', 'गुलाम ', 'पीओके ', 'आरएसएस ', 'भैंसियो ', 'चमचों ', 'पिल्ला ', 'गधे ', 'तबाह ', 'मुसलमान ', 'मुसलमानों ', 'मौलवी ', 'धर्म ']


# Hate speech Detection Algorithm

In [22]:
print(strongly_negative_words)  # printing the strongly negative words
print(weakly_negative_words)    # printing the weakly negative words
print(hlex)                     # printing the hlex words
print(themenouns)               # printing the themenouns 

['मृत', 'दुर्भाग्यशाली', 'अभागा', 'बदनसीब', 'भाग्यहीन', 'मनहूस', 'बदकिस्मत', 'मंदभाग्य', 'बदक़िस्मत', 'दईमारा', 'कमबख्त', 'कमबख़्त', 'अधन्य', 'अभागी', 'आवासहीन', 'आश्रयहीन', 'गृहहीन', 'गृहविहीन', 'बेघर', 'बेघरबार', 'अगतिक', 'अगेह', 'अनिकेत', 'बदबूदार', 'दुर्गंधपूर्ण', 'दुर्गंधयुक्त', 'दुर्गंधित', 'ढीला', 'अश्लिष्ट', 'असंयुक्त', 'असंयोजित', 'असंबद्ध', 'अलग', 'अजुड़ा', 'अजोड़', 'पृथक्', 'जुदा', 'पृथक', 'अपृक्त', 'पराधीन', 'गुलाम', 'परतंत्र', 'अन्याधीन', 'अपरवश', 'परवश', 'अवश', 'अबस', 'ढीला', 'जड़', 'अचैतन्य', 'जड़त्वयुक्त', 'स्थूल', 'अजैव', 'भौतिक', 'अचेतन', 'चेतनारहित', 'अजीव', 'अनात्म', 'आत्मारहित', 'अफल', 'अफलित', 'फलहीन', 'फलरहित', 'फलविहीन', 'निस्संतान', 'निःसंतान', 'बेऔलाद', 'संतानहीन', 'संतानरहित', 'अऊत', 'अनपत्य', 'भली-भाँति', 'भली_भाँति', 'भरपूर', 'भलीभाँति', 'भली-भांति', 'भली_भांति', 'भलीभांति', 'पंखहीन', 'पक्षरहित', 'अपक्ष', 'अपच्छी', 'अपत्र', 'अतकनीकी', 'ग़ैरतकनीकी', 'गैरतकनीकी', 'तकनीकहीन', 'प्राविधिहीन', 'दुर्गुणी', 'अगुणी', 'अपगुणी', 'ऐबी', 'खोटा', 'विपरीत', 'प्रतिकूल', 'ख

## Calculating Scores without Subjective Analysis

### Only Semantic feature set

In [23]:
for row in rows:                                                      # Iterate over all rows
    strongcount = 0                                                   # Initialize strong count
    hlexcount = 0                                                     # Initialize hlex count
    weakcount = 0                                                     # Initialize weak count
    themecount = 0                                                    # Initialize theme count                  
    if any([word in row[1] for word in strongly_negative_words]):     # If any of the strongly negative words are in the tweet
      strongcount += 1                                                # Increment strong count
    # if any([word in row[1] for word in hlex]):                        # If any of the hlex words are in the tweet
    #   hlexcount += 1                                                  # Increment hlex count
    if any([word in row[1] for word in weakly_negative_words]):       # If any of the weakly negative words are in the tweet
      weakcount += 1                                                  # Increment weak count    
    # if any([word in row[1] for word in themenouns]):                  # If any of the theme nouns are in the tweet
    #   themecount += 1                                                 # Increment theme count

    if strongcount >= 2:                                              # If strong count is greater than or equal to 2
        row.append("strongly hateful")                               # Append strongly hateful to the row 
    elif strongcount == 1:                                            # Else if strong count is 1
      if hlexcount >= 1 or themecount >= 1:                           # If hlex count is 1 or theme count is 1
        row.append("strongly hateful")                                # Append strongly hateful to the row
      else:                                                           # Else
        row.append("weakly hateful")                                  # Append weakly hateful to the row 
    elif strongcount == 0:                                            # Else if strong count is 0
      if themecount >= 1 and hlexcount >= 1:                          # If theme count is 1 and hlex count is 1
        row.append("strongly hateful")                                # Append strongly hateful to the row
      elif themecount >=1 and weakcount >= 1:                         # Else if theme count is 1 and weak count is 1
        row.append("weakly hateful")                                  # Append weakly hateful to the row
      elif hlexcount == 1:                                            # Else if hlex count is 1
        row.append("weakly hateful")                                  # Append weakly hateful to the row
      else:                                                           # Else
        row.append("No Hate")                                         # Append No Hate to the row


# total rows = toal number of rows
total_rows = [row for row in rows]

# no_hate_rows = number of rows that are marked to have no hate
no_hate_rows = [row for row in rows if row[4] == "No Hate"]
# correct_no_hate_rows = number of rows that have no hate speech and are correctly marked
correct_no_hate_rows = [row for row in no_hate_rows if row[4] == "No Hate" and row[2] == "non-hostile"]
# weak_hate_rows = number of rows that are marked to have weak hate
weak_hate_rows = [row for row in rows if row[4] == "weakly hateful"]
# correct_weak_hate_rows = number of rows that have weak hate speech and are correctly marked
correct_weak_hate_rows = [row for row in weak_hate_rows if row[4] == "weakly hateful" and (row[2] == "fake" or row[2] == "defamation")]
# strong_hate_rows = number of rows that are marked to have strong hate
strong_hate_rows = [row for row in rows if row[4] == "strongly hateful"]
# correct_strong_hate_rows = number of rows that have strong hate speech and are correctly marked
correct_strong_hate_rows = [row for row in strong_hate_rows if row[4] == "strongly hateful" and row[2] != "non-hostile" and row[2] != "fake" and row[2] != "defamation"]
# false negatives in the no hate list
false_neg_no_hate = [row for row in no_hate_rows if row[2] == "non-hostile" and row[4] != "No Hate"]
# false negatives in the weak hate list
false_neg_weak_hate = [row for row in weak_hate_rows if row[2] == "fake" or row[2] == "defamation" and row[4] != "weakly hateful"]
# false negatives in the strong hate list
false_neg_strong_hate = [row for row in strong_hate_rows if row[2] != "non-hostile" and row[2] != "fake" and row[2] != "defamation" and row[4] != "strongly hateful"]

# calculating precision
precision = (len(correct_no_hate_rows)+len(correct_strong_hate_rows)+len(correct_weak_hate_rows))/(len(no_hate_rows)+len(strong_hate_rows)+len(weak_hate_rows))
# calculating recall
recall = (len(correct_no_hate_rows)+len(correct_strong_hate_rows)+len(correct_weak_hate_rows))/(len(correct_no_hate_rows)+len(correct_strong_hate_rows)+len(correct_weak_hate_rows)+len(false_neg_no_hate)+len(false_neg_strong_hate)+len(false_neg_weak_hate))
# calculating F1 score
f1 = 2*precision*recall/(precision+recall)

print("Total no. of rows: {}".format(len(total_rows)))                  # total no. of rows
print("No Hate: {}".format(len(no_hate_rows)))                          # no hate rows
print("Actual no hate: {}".format(len(correct_no_hate_rows)))           # actual no hate rows
print("Weak Hate: {}".format(len(weak_hate_rows)))                      # weak hate rows      
print("Actual weak hate: {}".format(len(correct_weak_hate_rows)))       # actual weak hate rows
print("Strong Hate: {}".format(len(strong_hate_rows)))                  # strong hate rows
print("Actual strong hate: {}".format(len(correct_strong_hate_rows)))   # actual strong hate rows
# print precision
print("Precision: {}".format(precision))
# print recall
print("Recall: {}".format(recall))
# print f1
print("F-score: {}".format(f1))

Total no. of rows: 811
No Hate: 357
Actual no hate: 207
Weak Hate: 454
Actual weak hate: 112
Strong Hate: 0
Actual strong hate: 0
Precision: 0.3933415536374846
Recall: 0.7799511002444988
F-score: 0.5229508196721312


### Semantic + Hate Lexicon

In [24]:
for row in rows:                                                      # Iterate over all rows
    strongcount = 0                                                   # Initialize strong count
    hlexcount = 0                                                     # Initialize hlex count
    weakcount = 0                                                     # Initialize weak count
    themecount = 0                                                    # Initialize theme count                  
    if any([word in row[1] for word in strongly_negative_words]):     # If any of the strongly negative words are in the tweet
      strongcount += 1                                                # Increment strong count
    if any([word in row[1] for word in hlex]):                        # If any of the hlex words are in the tweet
      hlexcount += 1                                                  # Increment hlex count
    if any([word in row[1] for word in weakly_negative_words]):       # If any of the weakly negative words are in the tweet
      weakcount += 1                                                  # Increment weak count    
    # if any([word in row[1] for word in themenouns]):                  # If any of the theme nouns are in the tweet
    #   themecount += 1                                                 # Increment theme count

    if strongcount >= 2:                                              # If strong count is greater than or equal to 2
        row[4] = "strongly hateful"                                # Append strongly hateful to the row 
    elif strongcount == 1:                                            # Else if strong count is 1
      if hlexcount >= 1 or themecount >= 1:                           # If hlex count is 1 or theme count is 1
        row[4] = "strongly hateful"                                # Append strongly hateful to the row
      else:                                                           # Else
        row[4] = "weakly hateful"                                  # Append weakly hateful to the row 
    elif strongcount == 0:                                            # Else if strong count is 0
      if themecount >= 1 and hlexcount >= 1:                          # If theme count is 1 and hlex count is 1
        row[4] = "strongly hateful"                                # Append strongly hateful to the row
      elif themecount >=1 and weakcount >= 1:                         # Else if theme count is 1 and weak count is 1
        row[4] = "weakly hateful"                                  # Append weakly hateful to the row
      elif hlexcount == 1:                                            # Else if hlex count is 1
        row[4] = "weakly hateful"                                  # Append weakly hateful to the row
      else:                                                           # Else
        row[4] = "No Hate"                                        # Append No Hate to the row


# total rows = toal number of rows
total_rows = [row for row in rows]

# no_hate_rows = number of rows that are marked to have no hate
no_hate_rows = [row for row in rows if row[4] == "No Hate"]
# correct_no_hate_rows = number of rows that have no hate speech and are correctly marked
correct_no_hate_rows = [row for row in no_hate_rows if row[4] == "No Hate" and row[2] == "non-hostile"]
# weak_hate_rows = number of rows that are marked to have weak hate
weak_hate_rows = [row for row in rows if row[4] == "weakly hateful"]
# correct_weak_hate_rows = number of rows that have weak hate speech and are correctly marked
correct_weak_hate_rows = [row for row in weak_hate_rows if row[4] == "weakly hateful" and (row[2] == "fake" or row[2] == "defamation")]
# strong_hate_rows = number of rows that are marked to have strong hate
strong_hate_rows = [row for row in rows if row[4] == "strongly hateful"]
# correct_strong_hate_rows = number of rows that have strong hate speech and are correctly marked
correct_strong_hate_rows = [row for row in strong_hate_rows if row[4] == "strongly hateful" and row[2] != "non-hostile" and row[2] != "fake" and row[2] != "defamation"]
# false negatives in the no hate list
false_neg_no_hate = [row for row in no_hate_rows if row[2] == "non-hostile" and row[4] != "No Hate"]
# false negatives in the weak hate list
false_neg_weak_hate = [row for row in weak_hate_rows if row[2] == "fake" or row[2] == "defamation" and row[4] != "weakly hateful"]
# false negatives in the strong hate list
false_neg_strong_hate = [row for row in strong_hate_rows if row[2] != "non-hostile" and row[2] != "fake" and row[2] != "defamation" and row[4] != "strongly hateful"]

# calculating precision
precision = (len(correct_no_hate_rows)+len(correct_strong_hate_rows)+len(correct_weak_hate_rows))/(len(no_hate_rows)+len(strong_hate_rows)+len(weak_hate_rows))
# calculating recall
recall = (len(correct_no_hate_rows)+len(correct_strong_hate_rows)+len(correct_weak_hate_rows))/(len(correct_no_hate_rows)+len(correct_strong_hate_rows)+len(correct_weak_hate_rows)+len(false_neg_no_hate)+len(false_neg_strong_hate)+len(false_neg_weak_hate))
# calculating F1 score
f1 = 2*precision*recall/(precision+recall)

print("Total no. of rows: {}".format(len(total_rows)))                  # total no. of rows
print("No Hate: {}".format(len(no_hate_rows)))                          # no hate rows
print("Actual no hate: {}".format(len(correct_no_hate_rows)))           # actual no hate rows
print("Weak Hate: {}".format(len(weak_hate_rows)))                      # weak hate rows      
print("Actual weak hate: {}".format(len(correct_weak_hate_rows)))       # actual weak hate rows
print("Strong Hate: {}".format(len(strong_hate_rows)))                  # strong hate rows
print("Actual strong hate: {}".format(len(correct_strong_hate_rows)))   # actual strong hate rows
# print precision
print("Precision: {}".format(precision))
# print recall
print("Recall: {}".format(recall))
# print f1
print("F-score: {}".format(f1))

Total no. of rows: 811
No Hate: 355
Actual no hate: 206
Weak Hate: 455
Actual weak hate: 112
Strong Hate: 1
Actual strong hate: 1
Precision: 0.3933415536374846
Recall: 0.7799511002444988
F-score: 0.5229508196721312


### Semantic + Hate Lexicon + Thematic Nouns

In [25]:
for row in rows:                                                      # Iterate over all rows
    strongcount = 0                                                   # Initialize strong count
    hlexcount = 0                                                     # Initialize hlex count
    weakcount = 0                                                     # Initialize weak count
    themecount = 0                                                    # Initialize theme count                  
    if any([word in row[1] for word in strongly_negative_words]):     # If any of the strongly negative words are in the tweet
      strongcount += 1                                                # Increment strong count
    if any([word in row[1] for word in hlex]):                        # If any of the hlex words are in the tweet
      hlexcount += 1                                                  # Increment hlex count
    if any([word in row[1] for word in weakly_negative_words]):       # If any of the weakly negative words are in the tweet
      weakcount += 1                                                  # Increment weak count    
    if any([word in row[1] for word in themenouns]):                  # If any of the theme nouns are in the tweet
      themecount += 1                                                 # Increment theme count

    if strongcount >= 2:                                              # If strong count is greater than or equal to 2
        row[4] = "strongly hateful"                                # Append strongly hateful to the row 
    elif strongcount == 1:                                            # Else if strong count is 1
      if hlexcount >= 1 or themecount >= 1:                           # If hlex count is 1 or theme count is 1
        row[4] = "strongly hateful"                                # Append strongly hateful to the row
      else:                                                           # Else
        row[4] = "weakly hateful"                                  # Append weakly hateful to the row 
    elif strongcount == 0:                                            # Else if strong count is 0
      if themecount >= 1 and hlexcount >= 1:                          # If theme count is 1 and hlex count is 1
        row[4] = "strongly hateful"                                # Append strongly hateful to the row
      elif themecount >=1 and weakcount >= 1:                         # Else if theme count is 1 and weak count is 1
        row[4] = "weakly hateful"                                  # Append weakly hateful to the row
      elif hlexcount == 1:                                            # Else if hlex count is 1
        row[4] = "weakly hateful"                                 # Append weakly hateful to the row
      else:                                                           # Else
        row[4] = "No Hate"                                        # Append No Hate to the row


# total rows = toal number of rows
total_rows = [row for row in rows]

# no_hate_rows = number of rows that are marked to have no hate
no_hate_rows = [row for row in rows if row[4] == "No Hate"]
# correct_no_hate_rows = number of rows that have no hate speech and are correctly marked
correct_no_hate_rows = [row for row in no_hate_rows if row[4] == "No Hate" and row[2] == "non-hostile"]
# weak_hate_rows = number of rows that are marked to have weak hate
weak_hate_rows = [row for row in rows if row[4] == "weakly hateful"]
# correct_weak_hate_rows = number of rows that have weak hate speech and are correctly marked
correct_weak_hate_rows = [row for row in weak_hate_rows if row[4] == "weakly hateful" and (row[2] == "fake" or row[2] == "defamation")]
# strong_hate_rows = number of rows that are marked to have strong hate
strong_hate_rows = [row for row in rows if row[4] == "strongly hateful"]
# correct_strong_hate_rows = number of rows that have strong hate speech and are correctly marked
correct_strong_hate_rows = [row for row in strong_hate_rows if row[4] == "strongly hateful" and row[2] != "non-hostile" and row[2] != "fake" and row[2] != "defamation"]
# false negatives in the no hate list
false_neg_no_hate = [row for row in no_hate_rows if row[2] == "non-hostile" and row[4] != "No Hate"]
# false negatives in the weak hate list
false_neg_weak_hate = [row for row in weak_hate_rows if row[2] == "fake" or row[2] == "defamation" and row[4] != "weakly hateful"]
# false negatives in the strong hate list
false_neg_strong_hate = [row for row in strong_hate_rows if row[2] != "non-hostile" and row[2] != "fake" and row[2] != "defamation" and row[4] != "strongly hateful"]

# calculating precision
precision = (len(correct_no_hate_rows)+len(correct_strong_hate_rows)+len(correct_weak_hate_rows))/(len(no_hate_rows)+len(strong_hate_rows)+len(weak_hate_rows))
# calculating recall
recall = (len(correct_no_hate_rows)+len(correct_strong_hate_rows)+len(correct_weak_hate_rows))/(len(correct_no_hate_rows)+len(correct_strong_hate_rows)+len(correct_weak_hate_rows)+len(false_neg_no_hate)+len(false_neg_strong_hate)+len(false_neg_weak_hate))
# calculating F1 score
f1 = 2*precision*recall/(precision+recall)

print("Total no. of rows: {}".format(len(total_rows)))                  # total no. of rows
print("No Hate: {}".format(len(no_hate_rows)))                          # no hate rows
print("Actual no hate: {}".format(len(correct_no_hate_rows)))           # actual no hate rows
print("Weak Hate: {}".format(len(weak_hate_rows)))                      # weak hate rows      
print("Actual weak hate: {}".format(len(correct_weak_hate_rows)))       # actual weak hate rows
print("Strong Hate: {}".format(len(strong_hate_rows)))                  # strong hate rows
print("Actual strong hate: {}".format(len(correct_strong_hate_rows)))   # actual strong hate rows
# print precision
print("Precision: {}".format(precision))
# print recall
print("Recall: {}".format(recall))
# print f1
print("F-score: {}".format(f1))

Total no. of rows: 811
No Hate: 293
Actual no hate: 185
Weak Hate: 344
Actual weak hate: 85
Strong Hate: 174
Actual strong hate: 75
Precision: 0.4254007398273736
Recall: 0.843520782396088
F-score: 0.5655737704918032


## Calculating Scores with Subjective Analysis

In [26]:
counter = 0                             # Counter for the number of tweets
subj_rows = []                          # List of all the subjective tweets
for row in rows:                        # Iterate through each row
  if row[3] <= -0.5 or row[3] >= 1:     # subjective sentence condition
    subj_rows.append(row)               # Append the row to the list
    counter += 1

print("Number of Subjective Sentences: ")
print(counter)                          # Print the number of subjective tweets

Number of Subjective Sentences: 
355


### Semantic feature set

In [27]:
for row in rows:                                                    # Iterate over all rows
  if row[3] <= -0.5 or row[3] >= 1:                                 # If the score is over -0.5 or 0.5
    strongcount = 0                                                 # Set strongcount to 0
    hlexcount = 0                                                   # Set hlexcount to 0
    weakcount = 0                                                   # Set weakcount to 0                   
    themecount = 0                                                  # Set themecount to 0
    if any([word in row[1] for word in strongly_negative_words]):   # If any of the strongly negative words are in the tweet
      strongcount += 1                                              # Add 1 to strongcount  
    # if any([word in row[1] for word in hlex]):                      # If any of the hlex words are in the tweet
    #   hlexcount += 1                                                # Add 1 to hlexcount
    if any([word in row[1] for word in weakly_negative_words]):     # If any of the weakly negative words are in the tweet
      weakcount += 1                                                # Add 1 to weakcount                  
    # if any([word in row[1] for word in themenouns]):                # If any of the themenouns words are in the tweet
    #   themecount += 1                                               # Add 1 to themecount

    if strongcount >= 2:                                            # If strongcount is greater than or equal to 2
        row.append("strongly hateful")                              # Append strongly hate to the row
    elif strongcount == 1:                                          # Else if strongcount is equal to 1
      if hlexcount >= 1 or themecount >= 1:                         # If hlexcount is greater than or equal to 1 or themecount is greater than or equal to 1
        row.append("strongly hateful")                              # Append strongly hate to the row               
      else:                                                         # Else  
        row.append("weakly hateful")                                # Append weakly hate to the row
    elif strongcount == 0:                                          # Else if strongcount is equal to 0
      if themecount >= 1 and hlexcount >= 1:                        # If themecount is greater than or equal to 1 and hlexcount is greater than or equal to 1
        row.append("strongly hateful")                              # Append strongly hate to the row 
      elif themecount >=1 and weakcount >= 1:                       # Else if themecount is greater than or equal to 1 and weakcount is greater than or equal to 1
        row.append("weakly hateful")                                # Append weakly hate to the row
      elif hlexcount == 1:                                          # Else if hlexcount is equal to 1
        row.append("weakly hateful")                                # Append weakly hate to the row
      else:                                                         # Else                    
        row.append("No Hate")                                       # Append No Hate to the row
  else:                                                             # Else
     row.append("No Hate")                                          # Append No Hate to the row


total_rows = [row for row in rows]

no_hate_rows = [row for row in rows if row[5] == "No Hate"]
correct_no_hate_rows = [row for row in no_hate_rows if row[5] == "No Hate" and row[2] == "non-hostile"]
weak_hate_rows = [row for row in rows if row[5] == "weakly hateful"]
correct_weak_hate_rows = [row for row in weak_hate_rows if row[5] == "weakly hateful" and (row[2] == "fake" or row[2] == "defamation")]
strong_hate_rows = [row for row in rows if row[5] == "strongly hateful"]
correct_strong_hate_rows = [row for row in strong_hate_rows if row[5] == "strongly hateful" and row[2] != "non-hostile" and row[2] != "fake" and row[2] != "defamation"]

false_neg_no_hate = [row for row in no_hate_rows if row[2] == "non-hostile" and row[5] != "No Hate"]
false_neg_weak_hate = [row for row in weak_hate_rows if row[2] == "fake" or row[2] == "defamation" and row[5] != "weakly hateful"]
false_neg_strong_hate = [row for row in strong_hate_rows if row[2] != "non-hostile" and row[2] != "fake" and row[2] != "defamation" and row[5] != "strongly hateful"]

precision = (len(correct_no_hate_rows)+len(correct_strong_hate_rows)+len(correct_weak_hate_rows))/(len(no_hate_rows)+len(strong_hate_rows)+len(weak_hate_rows))
recall = (len(correct_no_hate_rows)+len(correct_strong_hate_rows)+len(correct_weak_hate_rows))/(len(correct_no_hate_rows)+len(correct_strong_hate_rows)+len(correct_weak_hate_rows)+len(false_neg_no_hate)+len(false_neg_strong_hate)+len(false_neg_weak_hate))
f1 = 2*precision*recall/(precision+recall)

print("Total no. of rows: {}".format(len(total_rows)))
print("No Hate: {}".format(len(no_hate_rows)))
print("Actual no hate: {}".format(len(correct_no_hate_rows)))
print("Weak Hate: {}".format(len(weak_hate_rows)))
print("Actual weak hate: {}".format(len(correct_weak_hate_rows)))
print("Strong Hate: {}".format(len(strong_hate_rows)))
print("Actual strong hate: {}".format(len(correct_strong_hate_rows)))
print("Precision: {}".format(precision))
print("Recall: {}".format(recall))
print("F-score: {}".format(f1))

Total no. of rows: 811
No Hate: 479
Actual no hate: 282
Weak Hate: 332
Actual weak hate: 86
Strong Hate: 0
Actual strong hate: 0
Precision: 0.45376078914919854
Recall: 0.8498845265588915
F-score: 0.5916398713826366


### Semantic + Hate Lexicon 

In [28]:
for row in rows:                                                    # Iterate over all rows
  if row[3] <= -0.5 or row[3] >= 1:                                 # If the score is over -0.5 or 0.5
    strongcount = 0                                                 # Set strongcount to 0
    hlexcount = 0                                                   # Set hlexcount to 0
    weakcount = 0                                                   # Set weakcount to 0                   
    themecount = 0                                                  # Set themecount to 0
    if any([word in row[1] for word in strongly_negative_words]):   # If any of the strongly negative words are in the tweet
      strongcount += 1                                              # Add 1 to strongcount  
    if any([word in row[1] for word in hlex]):                      # If any of the hlex words are in the tweet
      hlexcount += 1                                                # Add 1 to hlexcount
    if any([word in row[1] for word in weakly_negative_words]):     # If any of the weakly negative words are in the tweet
      weakcount += 1                                                # Add 1 to weakcount                  
    # if any([word in row[1] for word in themenouns]):                # If any of the themenouns words are in the tweet
    #   themecount += 1                                               # Add 1 to themecount

    if strongcount >= 2:                                              # If strong count is greater than or equal to 2
        row[5] = "strongly hateful"                                # Append strongly hateful to the row 
    elif strongcount == 1:                                            # Else if strong count is 1
      if hlexcount >= 1 or themecount >= 1:                           # If hlex count is 1 or theme count is 1
        row[5] = "strongly hateful"                                # Append strongly hateful to the row
      else:                                                           # Else
        row[5] = "weakly hateful"                                  # Append weakly hateful to the row 
    elif strongcount == 0:                                            # Else if strong count is 0
      if themecount >= 1 and hlexcount >= 1:                          # If theme count is 1 and hlex count is 1
        row[5] = "strongly hateful"                                # Append strongly hateful to the row
      elif themecount >=1 and weakcount >= 1:                         # Else if theme count is 1 and weak count is 1
        row[5] = "weakly hateful"                                  # Append weakly hateful to the row
      elif hlexcount == 1:                                            # Else if hlex count is 1
        row[5] = "weakly hateful"                                 # Append weakly hateful to the row
      else:                                                           # Else
        row[5] = "No Hate"                                        # Append No Hate to the row


total_rows = [row for row in rows]

no_hate_rows = [row for row in rows if row[5] == "No Hate"]
correct_no_hate_rows = [row for row in no_hate_rows if row[5] == "No Hate" and row[2] == "non-hostile"]
weak_hate_rows = [row for row in rows if row[5] == "weakly hateful"]
correct_weak_hate_rows = [row for row in weak_hate_rows if row[5] == "weakly hateful" and (row[2] == "fake" or row[2] == "defamation")]
strong_hate_rows = [row for row in rows if row[5] == "strongly hateful"]
correct_strong_hate_rows = [row for row in strong_hate_rows if row[5] == "strongly hateful" and row[2] != "non-hostile" and row[2] != "fake" and row[2] != "defamation"]

false_neg_no_hate = [row for row in no_hate_rows if row[2] == "non-hostile" and row[5] != "No Hate"]
false_neg_weak_hate = [row for row in weak_hate_rows if row[2] == "fake" or row[2] == "defamation" and row[5] != "weakly hateful"]
false_neg_strong_hate = [row for row in strong_hate_rows if row[2] != "non-hostile" and row[2] != "fake" and row[2] != "defamation" and row[5] != "strongly hateful"]

precision = (len(correct_no_hate_rows)+len(correct_strong_hate_rows)+len(correct_weak_hate_rows))/(len(no_hate_rows)+len(strong_hate_rows)+len(weak_hate_rows))
recall = (len(correct_no_hate_rows)+len(correct_strong_hate_rows)+len(correct_weak_hate_rows))/(len(correct_no_hate_rows)+len(correct_strong_hate_rows)+len(correct_weak_hate_rows)+len(false_neg_no_hate)+len(false_neg_strong_hate)+len(false_neg_weak_hate))
f1 = 2*precision*recall/(precision+recall)

print("Total no. of rows: {}".format(len(total_rows)))
print("No Hate: {}".format(len(no_hate_rows)))
print("Actual no hate: {}".format(len(correct_no_hate_rows)))
print("Weak Hate: {}".format(len(weak_hate_rows)))
print("Actual weak hate: {}".format(len(correct_weak_hate_rows)))
print("Strong Hate: {}".format(len(strong_hate_rows)))
print("Actual strong hate: {}".format(len(correct_strong_hate_rows)))
print("Precision: {}".format(precision))
print("Recall: {}".format(recall))
print("F-score: {}".format(f1))

Total no. of rows: 811
No Hate: 479
Actual no hate: 282
Weak Hate: 331
Actual weak hate: 86
Strong Hate: 1
Actual strong hate: 1
Precision: 0.45499383477188654
Recall: 0.8502304147465438
F-score: 0.5927710843373494


### Semantic + Hate Lexicon + Thematic Nouns

In [29]:
for row in rows:                                                    # Iterate over all rows
  if row[3] <= -0.5 or row[3] >= 1:                                 # If the score is over -0.5 or 0.5
    strongcount = 0                                                 # Set strongcount to 0
    hlexcount = 0                                                   # Set hlexcount to 0
    weakcount = 0                                                   # Set weakcount to 0                   
    themecount = 0                                                  # Set themecount to 0
    if any([word in row[1] for word in strongly_negative_words]):   # If any of the strongly negative words are in the tweet
      strongcount += 1                                              # Add 1 to strongcount  
    if any([word in row[1] for word in hlex]):                      # If any of the hlex words are in the tweet
      hlexcount += 1                                                # Add 1 to hlexcount
    if any([word in row[1] for word in weakly_negative_words]):     # If any of the weakly negative words are in the tweet
      weakcount += 1                                                # Add 1 to weakcount                  
    if any([word in row[1] for word in themenouns]):                # If any of the themenouns words are in the tweet
      themecount += 1                                               # Add 1 to themecount

    if strongcount >= 2:                                              # If strong count is greater than or equal to 2
        row[5] = "strongly hateful"                                # Append strongly hateful to the row 
    elif strongcount == 1:                                            # Else if strong count is 1
      if hlexcount >= 1 or themecount >= 1:                           # If hlex count is 1 or theme count is 1
        row[5] = "strongly hateful"                                # Append strongly hateful to the row
      else:                                                           # Else
        row[5] = "weakly hateful"                                  # Append weakly hateful to the row 
    elif strongcount == 0:                                            # Else if strong count is 0
      if themecount >= 1 and hlexcount >= 1:                          # If theme count is 1 and hlex count is 1
        row[5] = "strongly hateful"                                # Append strongly hateful to the row
      elif themecount >=1 and weakcount >= 1:                         # Else if theme count is 1 and weak count is 1
        row[5] = "weakly hateful"                                  # Append weakly hateful to the row
      elif hlexcount == 1:                                            # Else if hlex count is 1
        row[5] = "weakly hateful"                                 # Append weakly hateful to the row
      else:                                                           # Else
        row[5] = "No Hate"                                        # Append No Hate to the row


total_rows = [row for row in rows]

no_hate_rows = [row for row in rows if row[5] == "No Hate"]
correct_no_hate_rows = [row for row in no_hate_rows if row[5] == "No Hate" and row[2] == "non-hostile"]
weak_hate_rows = [row for row in rows if row[5] == "weakly hateful"]
correct_weak_hate_rows = [row for row in weak_hate_rows if row[5] == "weakly hateful" and (row[2] == "fake" or row[2] == "defamation")]
strong_hate_rows = [row for row in rows if row[5] == "strongly hateful"]
correct_strong_hate_rows = [row for row in strong_hate_rows if row[5] == "strongly hateful" and row[2] != "non-hostile" and row[2] != "fake" and row[2] != "defamation"]

false_neg_no_hate = [row for row in no_hate_rows if row[2] == "non-hostile" and row[5] != "No Hate"]
false_neg_weak_hate = [row for row in weak_hate_rows if row[2] == "fake" or row[2] == "defamation" and row[5] != "weakly hateful"]
false_neg_strong_hate = [row for row in strong_hate_rows if row[2] != "non-hostile" and row[2] != "fake" and row[2] != "defamation" and row[5] != "strongly hateful"]

precision = (len(correct_no_hate_rows)+len(correct_strong_hate_rows)+len(correct_weak_hate_rows))/(len(no_hate_rows)+len(strong_hate_rows)+len(weak_hate_rows))
recall = (len(correct_no_hate_rows)+len(correct_strong_hate_rows)+len(correct_weak_hate_rows))/(len(correct_no_hate_rows)+len(correct_strong_hate_rows)+len(correct_weak_hate_rows)+len(false_neg_no_hate)+len(false_neg_strong_hate)+len(false_neg_weak_hate))
f1 = 2*precision*recall/(precision+recall)

print("Total no. of rows: {}".format(len(total_rows)))
print("No Hate: {}".format(len(no_hate_rows)))
print("Actual no hate: {}".format(len(correct_no_hate_rows)))
print("Weak Hate: {}".format(len(weak_hate_rows)))
print("Actual weak hate: {}".format(len(correct_weak_hate_rows)))
print("Strong Hate: {}".format(len(strong_hate_rows)))
print("Actual strong hate: {}".format(len(correct_strong_hate_rows)))
print("Precision: {}".format(precision))
print("Recall: {}".format(recall))
print("F-score: {}".format(f1))

Total no. of rows: 811
No Hate: 473
Actual no hate: 280
Weak Hate: 208
Actual weak hate: 51
Strong Hate: 130
Actual strong hate: 63
Precision: 0.48581997533908755
Recall: 0.9184149184149184
F-score: 0.635483870967742


## Exporting results into results.csv

In [30]:
import csv                                                                  # Importing the csv module

fields = ['Unique ID', 'Post', 'Labels Set', 'Total Score', 'Hate Label' ,'Subjective Hate Label']   # Defining the fields of the csv file
with open("results.csv", 'w') as csvfile:                                   # Opening the file                          
    # creating a csv writer object 
    csvwriter = csv.writer(csvfile) 
        
    # writing the fields 
    csvwriter.writerow(fields) 
        
    # writing the data rows 
    csvwriter.writerows(rows)