In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict

import string, re
import nltk
nltk.download('wordnet')
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from nltk.corpus import stopwords
import spacy

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the raw tweet export into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell only
# runs where that exact file exists — consider a configurable data directory.
tweets_df = pd.read_csv('C:/Users/hp/Desktop/HateSpeech/onlineDatasets/caa-tweets-till-9012020/file.csv')

In [3]:
# Show the column labels of the loaded frame.
tweets_df.columns

Index(['id', 'conversation_id', 'created_at', 'date', 'time', 'timezone',
       'user_id', 'username', 'name', 'place', 'tweet', 'mentions', 'urls',
       'photos', 'replies_count', 'retweets_count', 'likes_count', 'hashtags',
       'cashtags', 'link', 'retweet', 'quote_url', 'video', 'near', 'geo',
       'source', 'user_rt_id', 'user_rt', 'retweet_id', 'reply_to',
       'retweet_date', 'translate', 'trans_src', 'trans_dest'],
      dtype='object')

In [4]:
# Work on (at most) the first 190000 rows of the loaded frame.
all_tweets = tweets_df[:190000]

# Pull out the columns of interest as individual Series.
caa_tweets, date, hashtags, retweets_count, likes_count = (
    all_tweets[column]
    for column in ('tweet', 'date', 'hashtags', 'retweets_count', 'likes_count')
)

# One (tweet, date, hashtags, retweets_count, likes_count) tuple per row.
useful_tweets = [
    row for row in zip(caa_tweets, date, hashtags, retweets_count, likes_count)
]

In [5]:
# Function to separate hindi and others

def separate_english_hindi_tweets(tweets):
    """Split tweet tuples by whether the tweet text contains Devanagari characters.

    Args:
        tweets: iterable of tuples whose first element is the tweet text (a str).

    Returns:
        A pair ``(etweets, htweets)`` of lists. ``htweets`` holds every tuple whose
        text contains at least one character of the Devanagari Unicode block;
        ``etweets`` holds all the others. Input order is preserved in both lists.
    """
    # Devanagari block is U+0900..U+097F (2304..2431). The previous bounds
    # (2301..2399) missed the danda and the Devanagari digits, and included three
    # code points that belong to a different script.
    devanagari_first, devanagari_last = 0x0900, 0x097F

    etweets = []
    htweets = []
    for tweet_tup in tweets:
        tweet = tweet_tup[0]
        has_hindi = any(
            devanagari_first <= ord(c) <= devanagari_last for c in tweet
        )
        if has_hindi:
            htweets.append(tweet_tup)
        else:
            etweets.append(tweet_tup)

    return etweets, htweets

In [6]:
# Split the row tuples in one step: the helper returns the tuples it did not flag
# as Hindi first and the flagged ones second.
english_tweets, hindi_tweets = separate_english_hindi_tweets(useful_tweets)

In [7]:
# Separate English from others

etweets, otweets = [], []

for candidate in english_tweets:
    for ch in candidate[0]:
        code = ord(ch)
        accepted = (
            ch == "\n"
            or 31 < code < 250        # printable ASCII plus the code points just above it
            or 8200 < code < 8400     # special punctuations
            or code > 9000            # smileys
        )
        if not accepted:
            # First rejected character decides: the tweet goes to "others".
            otweets.append(candidate)
            break
    else:
        # Loop finished without a break: every character was accepted.
        etweets.append(candidate)

In [8]:
# Peek at the first tuple that survived both language filters.
etweets[0]

('Yet another reason why India needs #CAA: \n\nHindus Beaten by Pakistani Police for Hoisting Saffron Flag in Their Own Home. Video Published to Cower Other Hindus into Submission!\n\n https://www.youtube.com/watch?v=lTQxDeBmCyI\xa0‚Ä¶\n@MEAIndia @Swamy39 @blsanthosh @davidfrawleyved @MODIfiedVikas @ShefVaidya',
 '2020-01-08',
 "['#caa']",
 0,
 0)

In [9]:
# Copy first so etweets keeps its order, then rank by retweet count (index 3), highest first.
retweet_sorted_tweets = list(etweets)
retweet_sorted_tweets.sort(key=lambda tup: tup[3], reverse=True)

In [10]:
# Show the ten tuples with the highest retweet counts.
retweet_sorted_tweets[:10]

[('#CAA + #NRC + more this Sunday on @patriotact pic.twitter.com/AIoAub8Fwu',
  '2019-12-20',
  "['#caa', '#nrc']",
  17739,
  42902),
 ('My university üòç\nStudents of #PanjabUniversity in support of #CAA\nChandigarh is not only beautiful, but Nationalist too ‚ù§Ô∏è  pic.twitter.com/NUgngteXur',
  '2019-12-18',
  "['#panjabuniversity', '#caa']",
  9247,
  28077),
 ('Stop watching Hindi movies of the actors, directors, writers, anyone who has supported the violent protests, looting and arson by Muzlims over #CAA. This is ONLY way to teach them a lesson. Starve them of money.',
  '2019-12-18',
  "['#caa']",
  8570,
  21134),
 ('Delhi with #CAA\nDelhi with @narendramodi \nDelhi with @AmitShah pic.twitter.com/pDgdIoZLvh',
  '2019-12-20',
  "['#caa']",
  8561,
  20395),
 ('#CAA is meant to provide fast track citizenship to non-Muslim families from Afghanistan, Pakistan and Bangladesh that have been lynched, raped and persecuted for generations due to their religious beliefs. \n\nWhatever 

In [11]:
# Copy first so etweets keeps its order, then rank by likes count (index 4), highest first.
likes_sorted_tweets = list(etweets)
likes_sorted_tweets.sort(key=lambda tup: tup[4], reverse=True)

In [12]:
# Show the ten tuples with the highest likes counts.
likes_sorted_tweets[:10]

[('#CAA + #NRC + more this Sunday on @patriotact pic.twitter.com/AIoAub8Fwu',
  '2019-12-20',
  "['#caa', '#nrc']",
  17739,
  42902),
 ('My university üòç\nStudents of #PanjabUniversity in support of #CAA\nChandigarh is not only beautiful, but Nationalist too ‚ù§Ô∏è  pic.twitter.com/NUgngteXur',
  '2019-12-18',
  "['#panjabuniversity', '#caa']",
  9247,
  28077),
 ('Stop watching Hindi movies of the actors, directors, writers, anyone who has supported the violent protests, looting and arson by Muzlims over #CAA. This is ONLY way to teach them a lesson. Starve them of money.',
  '2019-12-18',
  "['#caa']",
  8570,
  21134),
 ('India stands strong with Hon @narendramodi ji & Hon @AmitShah ji for solving decades old problem by #CAA & giving new lease of life in Bharat, to our brothers & sisters facing religious persecution in neighbouring countries.\n\n(Siliguri #WestBengal on 24 Dec ‚Äò19)\n#IndiaSupportsCAA pic.twitter.com/EnUURXAWol',
  '2019-12-30',
  "['#caa', '#westbengal', '#indi

In [13]:
# Create a dictionary for counting the number of tweets on that particular date as key-value pair

dates = defaultdict(int)


def count_date_tweets(tweet_tups):
    """Tally tweets per date into the module-level ``dates`` mapping.

    The element at index 1 of each item is used as the key. Counts are added on
    top of whatever ``dates`` already holds; nothing is returned.
    """
    for key in (item[1] for item in tweet_tups):
        dates[key] += 1

In [14]:
# Tally the filtered tweets by date into the global `dates` mapping.
# NOTE: the function adds to the existing counts, so re-running this cell without
# re-creating `dates` counts every tweet again.
count_date_tweets(etweets)

In [15]:
# Split the per-date tally into two parallel lists (both follow the dict's iteration order).
dates_for_tweets = [*dates]
tweets_per_date = [*dates.values()]

In [16]:
# Function to remove urls, hashtags and punctuations

#import re,string

# Compiled once instead of on every call. Written as a raw string so the regex
# escapes (\w, \d, \+, ...) are not treated as (invalid) Python string escapes;
# the resulting pattern is the same as before.
# NOTE(review): inside the character class, `\+-=` is a range from '+' to '=' and
# therefore also admits ',' and '<'; kept unchanged here.
_LINK_REGEX = re.compile(
    r'((https?):((//)|(\\))+([\w\d:#@%/;$()~_?\+-=\\.&](#!)?)*)',
    re.DOTALL,
)


def strip_links(text):
    """Replace every http(s) link found in ``text`` with ``', '``.

    Args:
        text: the string to clean.

    Returns:
        A new string in which each match of the link pattern is replaced in
        place. Replacing match-by-match avoids damaging a longer link when a
        shorter matched link happens to be its prefix.
    """
    return _LINK_REGEX.sub(', ', text)

def strip_all_entities(text):
    """Blank out punctuation and drop words that start with '@' or '#'.

    Every character of ``string.punctuation`` except '@' and '#' is turned into
    a space, the text is split on whitespace, words whose first character is
    '@' or '#' are discarded, and the rest are joined with single spaces.
    """
    entity_prefixes = ('@', '#')
    # Translation table: each non-prefix punctuation mark becomes a space.
    blank_out = {
        ord(mark): ' '
        for mark in string.punctuation
        if mark not in entity_prefixes
    }
    spaced = text.translate(blank_out)
    kept = (token for token in spaced.split() if token[0] not in entity_prefixes)
    return ' '.join(kept)

In [None]:
# Clean the text of every filtered tweet: drop links first, then punctuation,
# @mentions and #hashtags. Iterate over the tuples themselves (iterating
# str(etweets) yields single characters) and keep the results instead of
# discarding them.
cleaned_tweets = [strip_all_entities(strip_links(tup[0])) for tup in etweets]

In [None]:
# NOTE(review): in the visible notebook `words` is first assigned in the
# "Normalization" cell further down, so on a fresh kernel this cell raises
# NameError unless that cell has been run first.
print(words[:100])

In [None]:
# Normalization

# Tokenize the tweet text itself, one tweet at a time. Tokenizing str(etweets)
# would tokenize the repr of a list of tuples, so dates, hashtag lists, counts
# and escaped newlines (a literal backslash + 'n') would leak into the tokens.
tokens = [
    token
    for tweet_tup in etweets
    for token in word_tokenize(tweet_tup[0])
]

# convert to lower case
tokens = [w.lower() for w in tokens]

# remove punctuation from each word
table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in tokens]

# remove remaining tokens that are not alphabetic
words = [word for word in stripped if word.isalpha()]

# filter out stop words
# NOTE(review): this needs the NLTK 'stopwords' corpus; no visible cell downloads it.
stop_words = set(stopwords.words('english'))
words = [w for w in words if w not in stop_words]
print(words[:1000])
print('\n')

# stemming of words
porter = PorterStemmer()
stemmed = [porter.stem(word) for word in words]
print(stemmed[:1000])

In [None]:
# Lemmatization using NLTK

# Init the Wordnet Lemmatizer
lemmatizer = WordNetLemmatizer()

# Use the stemmed tokens directly. Re-tokenizing str(stemmed) would tokenize the
# repr of a Python list, yielding brackets, commas and quote-prefixed tokens
# that the lemmatizer cannot match.
word_list = list(stemmed)

# Lemmatize list of words and join
lemmatized_output = ' '.join([lemmatizer.lemmatize(w) for w in word_list])
print(lemmatized_output[:1000])

In [None]:
# Counting the frequency of words

# Count each normalized word exactly once. The previous version wrapped the
# count in a loop over etweets that never used the tweet, so every count was
# multiplied by len(etweets) and the run time grew with tweets x words.
word_counts = Counter(words)
freqDict = dict(word_counts)

# Show only the most frequent entries rather than dumping the whole dictionary.
print(word_counts.most_common(50))

In [None]:
# NER

nlp = spacy.load("en_core_web_sm")

# nlp() expects a single text, not a list of tuples. Stream the tweet texts
# through nlp.pipe so each tweet is parsed once, as its own document.
# Maps entity text -> entity label; a later occurrence overwrites an earlier one.
d = dict()
for doc in nlp.pipe(tweet_tup[0] for tweet_tup in etweets):
    for ent in doc.ents:
        d[ent.text] = ent.label_

# Print only the entities labelled as persons.
for x, y in d.items():
    if y == 'PERSON':
        print(x, y)