# Overview of Hostile Post Detection in Hindi
- **Goals:** To predict the label set of the set of post collected from Twitter and facebook.

- **Traning data:** post with their label.

- **Tesing data:** set of posts.
- **Types of post:** 
1. *Fake News:* A claim or information that is verified to be not true.
2. *Hate Speech:* A post targeting a specific group of people based on their ethnicity, religious beliefs, geographical belonging, race, etc., with malicious intentions of spreading hate or encouraging violence.
3. *Offensive:* A post containing profanity, impolite, rude, or vulgar language to insult a targeted individual or group.
4. *Defamation:* A mis-information regarding an individual or group.
5. *Non-hostile:* A post without any hostility.



- **dataset.csv:** contains trainng data.

## Exploring dataset

In [1]:
# importing libraries
import pandas as pd     
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
pd.set_option('display.max_colwidth')

ValueError: Must provide an even number of non-keyword arguments

#### load data

In [None]:
data = pd.read_csv("dataset.csv")

#### How many posts do we have in training set

In [None]:
print("We have", data.shape[0], "posts in the training set.")

#### First and last five posts of dataset

In [None]:
data.head()

In [None]:
data.tail()

#### Information about the attributes and tupples.

In [None]:
data.info()

#### removing extra columns

In [None]:
data=data.drop(data.columns[[0,3,4]], axis=1)

#### New information of data 

In [None]:
data.info()

In [None]:
data.head()

In [None]:
data.tail()

#### Calculating the null values in each columns

In [None]:
data.isnull().sum()

#### Number of columns and rows in train data set

In [None]:
data.shape
print(data.shape,"is the dimension of the data")

#### Checking for the duplicates in the dataset.

In [None]:
print(len(data['Post'])-len(set(data['Post'])))

#### Calculating the number and percentage of different types of post.

In [None]:
d={}
total=0
for i in data['Labels Set']:
    ar=list(i.split(','))
    for j in ar:
        try:
            d[j]+=1
        except:
            d[j]=1
        total+=1


percentageHolder={}
for i in d:
    percentageHolder[i]=str((d[i]/total)*100)[:5]+'%'



table={'count':d,'percentage':percentageHolder}
print(pd.DataFrame(table))

#### Represntation of label

In [None]:
#pie chart creation

labels = list(d.keys())
values = list(d.values())
plt.figure(figsize=(6,6))
plt.pie(values, labels=labels,autopct="%1.2f%%")
plt.show()

Analysis:


Data is biased towards non-hostile post.

In [None]:
# Bar Plot
plt.bar(d.keys(), d.values(), 0.6, color=['g','g','r','g','g'])
plt.xlabel("Labels", labelpad=14)
plt.ylabel("Frequency", labelpad=14)
sb.set(font_scale=1.4)
plt.xticks(fontsize=14, rotation=90)

Analysis:

1) We observe that non-hostile posts has the highest proportion i.e more than 3000.

In [None]:
data.info()

In [None]:
#PREPROCESSING

In [None]:
#Checking if the duplicate posts have duplicate labels or not

temp=set()
d={}
index=0
for i in data['Post']:
    if i not in temp:
        temp.add(i)
        d[i]=[[data['Labels Set'][index],index]]
    else:
        d[i].append([data['Labels Set'][index],index])
        print(d[i])
    index+=1
        

In [None]:
#removing duplicate rows

postTemp={'Post':[],'Labels Set':[]}
visited=set()
index=0

for i in data['Post']:
    if i not in visited and index!=4970:
        postTemp['Post'].append(i)
        postTemp['Labels Set'].append(data['Labels Set'][index])
        visited.add(i)
    index+=1
        
data=pd.DataFrame.from_dict(postTemp)

data

In [None]:
#removing urls & mentions from the string

data['Post'] = data['Post'].str.replace('http\S+|www.\S+|@\S+', '', case=False)

data

In [None]:
#separating Hashtags
ht=[]


for i in data['Post']:
    temp=[]
    start=0
    for j in i:
        if j=='#':
            start=1
            holder=''
        if j==' ' and start:
            start=0
            temp.append(holder)
            
        elif start==1:
            holder+=j
    ht.append(temp)
            
data['Hashtag']=ht

data['Post'] = data['Post'].str.replace('#\S+', '', case=False)

data

In [None]:
#removing Punctuations

f=open("punctuations.txt","r")
punctuations=set(f.read().split('\n'))
index=0

for i in data['Post']:
    temp=''
    for j in i:
        if j not in punctuations:
            temp+=j
        else:
            temp+=' '
    data['Post'][index]=temp
    index+=1
    
data

In [None]:
#removing stop words 

f=open("stopWords.txt","r")
stopWords=set(f.read().split('\n'))

index=0

for i in data['Post']:
    temp=''
    words=i.split(' ')
    for j in words:
        if j not in stopWords:
            temp+=j
            temp+=' '
    data['Post'][index]=temp
    index+=1

data

In [None]:
#separating emojis

emojis=[]
index=0

for i in data['Post']:
    post=""
    temp=[]
    for j in i:
        if (u'\u0900' <= j <= u'\u097f' or j==' ' or j.isdigit() or j=='\n' or j.isalpha()):
            post+=j
        else:
            temp.append(j)
    emojis.append(temp)
    data['Post'][index]=post
    index=0
    
data['emojis']=emojis

data

In [None]:
#english specific preprocessing

In [None]:
#Translating hindi to english using "googletrans" API
# import time
# errorCounter=0
# cnt=0
# eng=[]
# f = open("res.txt", "a")
# f.write("Now the file has more content!")
# for i in range(5728):
#     t=Translator()
#     cnt+=1
#     time.sleep(1)
    
#     print(cnt,errorCounter)
#     # if i%5==0:
#     #     time.sleep(1)
#     try:
#         f.write(t.translate(translationInput[i]).text)
#         f.write('\n')
#     except:
#         errorCounter+=1
# print("Total Loss = " + str(errorCounter))

In [None]:
#lowercasing 

f = open("res.txt", "r")
englishPost=list(f.read().split('\n'))

for i in range(len(englishPost)):
    englishPost[i]=englishPost[i].lower()

englishPost

In [None]:
#Stemming

from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 
   
ps = PorterStemmer()
index=0

for i in englishPost:
    sentence=''
    for word in i:
        sentence+=ps.stem(word)
    englishPost[index]=sentence
    index+=0
    
englishPost

In [None]:
#removing stop words from english

f=open("punctuations.txt","r")
stopWordsEnglish=set(f.read().split('\n'))

index=0

for i in englishPost:
    words=i.split(' ')
    sentence=''
    for word in words:
        if word not in stopWordsEnglish:
            sentence+=word
    englishPost[index]=sentence
    index=0

englishPost