<a href="https://colab.research.google.com/github/talhaanwarch/Profiling-Hate-Speech-Spreaders-on-Twitter/blob/main/en_ak.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-41ed80c3-5105-8f43-1108-9ed46036dd2c)


#install pacakges

In [2]:
%%capture
!python3 -m pip install autokeras

# download data

In [3]:
import os
if os.path.isdir('/content/Profiling-Hate-Speech-Spreaders-on-Twitter') is False:
    !git clone https://github.com/talhaanwarch/Profiling-Hate-Speech-Spreaders-on-Twitter.git

In [4]:
%%capture 
if os.path.isdir('train') is False:
  !unzip /content/Profiling-Hate-Speech-Spreaders-on-Twitter/data/train.zip
if os.path.isdir('test') is False:
  !unzip /content/Profiling-Hate-Speech-Spreaders-on-Twitter/data/test.zip

# prepare data

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
df=pd.read_csv('train/en/truth.txt',sep=':::',header=None,engine='python')
df.columns=['id','label']

In [6]:
df.label.value_counts()

1    100
0    100
Name: label, dtype: int64

In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('stopwords')
lemmatizer = WordNetLemmatizer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
stop = stopwords.words('english')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
def preprocessing(text):
    text=text.str.replace('\d+', '')
    text=text.str.replace('RT','')
    text=text.str.replace('#USER#','')
    text=text.str.replace('#URL#','')
    text= text.str.lower()
    text = text.str.replace('[^\w\s]','')
    text = text.apply(lambda x : [lemmatizer.lemmatize(y) for y in w_tokenizer.tokenize(x)])
    text = text.apply(lambda x: [item for item in x if item not in stop])
    text = text.apply(lambda x : " ".join(x))
    return text

In [9]:
df.id='train/en/'+df.id.astype(str)+'.xml'
df.head()

Unnamed: 0,id,label
0,train/en/639b8e5e6a527d494c85d8f5704b1a01.xml,0
1,train/en/f2b1fc84c500c38a93522efbd422b559.xml,0
2,train/en/10b2d013382e1fb3c9414ea28329f258.xml,0
3,train/en/26644d1348fc1122e8c5ef45d6bc84fa.xml,0
4,train/en/4d4c5dcbfe38d0d33a0d1b1419952ca8.xml,0


In [10]:
len(df)

200

In [11]:
import xml.etree.ElementTree as ET
def reader(df,ground=True):
  data=[]
  for x in df.iterrows():
      
      tree = ET.parse(x[1].id)
      root = tree.getroot()
      text=[x.text for x in root[0]]
      if ground:
        label=[x[1].label]*len(text)
        data.append(pd.DataFrame(zip(text,label),columns=['text','label']))
      else:
        data.append(pd.DataFrame(text,columns=['text']))
  return data

In [12]:
data=reader(df)

In [13]:
data[0].head()

Unnamed: 0,text,label
0,RT #USER#: Funny how “15 days to slow the spre...,0
1,RT #USER#: Why did Minneapolis just give Georg...,0
2,"RT #USER#: To be fair, he has done a lot of un...",0
3,RT #USER#: President Trump got us the #HASHTAG...,0
4,RT #USER#: Is the case against former officer ...,0


In [14]:
from sklearn.model_selection import train_test_split
trainx,valx=train_test_split(data,test_size=0.1,)
trainx,testx=train_test_split(trainx,test_size=0.1)

In [15]:
train=pd.concat(trainx)
val=pd.concat(valx)
test=pd.concat(testx)

In [16]:
train.label.value_counts()

0    16800
1    15600
Name: label, dtype: int64

In [17]:
val.label.value_counts()

1    2600
0    1400
Name: label, dtype: int64

In [18]:
test.label.value_counts()

1    1800
0    1800
Name: label, dtype: int64

In [19]:
train.text=preprocessing(train.text)
val.text=preprocessing(val.text)
test.text=preprocessing(test.text)

In [20]:
len(train),len(val),len(test)

(32400, 4000, 3600)

In [21]:
train = train.sample(frac=1).reset_index(drop=True)
val = val.sample(frac=1).reset_index(drop=True)
test = test.sample(frac=1).reset_index(drop=True)


In [22]:
train_text=train.text.values
train_label=train.label.values
val_text=val.text.values
val_label=val.label.values
test_text=test.text.values
test_label=test.label.values

# classify data

In [23]:
import autokeras as ak
clf = ak.TextClassifier() 

INFO:tensorflow:Reloading Oracle from existing project ./text_classifier/oracle.json
INFO:tensorflow:Reloading Tuner from ./text_classifier/tuner0.json


In [24]:
clf.fit(train_text,train_label,epoch=100,validation_data=(val_text, val_label))


INFO:tensorflow:Oracle triggered exit
INFO:tensorflow:Assets written to: ./text_classifier/best_model/assets


In [25]:
val_score = clf.evaluate(val_text, val_label)



In [26]:
test_score = clf.evaluate(test_text,test_label)



In [27]:
from sklearn.metrics import classification_report
pred=clf.predict(val_text)
print(classification_report(val_label,pred))

              precision    recall  f1-score   support

           0       0.52      0.59      0.55      1400
           1       0.76      0.71      0.74      2600

    accuracy                           0.67      4000
   macro avg       0.64      0.65      0.64      4000
weighted avg       0.68      0.67      0.67      4000



In [28]:
pred=clf.predict(test_text)
print(classification_report(test_label,pred))

              precision    recall  f1-score   support

           0       0.70      0.62      0.66      1800
           1       0.66      0.74      0.70      1800

    accuracy                           0.68      3600
   macro avg       0.68      0.68      0.68      3600
weighted avg       0.68      0.68      0.68      3600



# validation

In [29]:
preds,trues=[],[]
for i in valx:
  pred=list(clf.predict(i.text.values,verbose=0))
  preds.append(max(pred,key=pred.count))
  trues.append(int(np.mean(i.label)))
print(classification_report(trues,preds))  

              precision    recall  f1-score   support

           0       0.35      1.00      0.52         7
           1       0.00      0.00      0.00        13

    accuracy                           0.35        20
   macro avg       0.17      0.50      0.26        20
weighted avg       0.12      0.35      0.18        20



  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
preds,trues=[],[]
for i in testx:
  pred=list(clf.predict(i.text.values,verbose=0))
  preds.append(max(pred,key=pred.count))
  trues.append(int(np.mean(i.label)))
print(classification_report(trues,preds))  

              precision    recall  f1-score   support

           0       0.53      1.00      0.69         9
           1       1.00      0.11      0.20         9

    accuracy                           0.56        18
   macro avg       0.76      0.56      0.45        18
weighted avg       0.76      0.56      0.45        18



# test evaluation

In [31]:
from glob import glob
test_path=pd.DataFrame(glob('test/en/*.xml'),columns=['id'])
test_path.head()

Unnamed: 0,id
0,test/en/f2973063f16c0698a6de86c55b5f9ad6.xml
1,test/en/be71e41921653e523aceb555727b61c7.xml
2,test/en/d0235c3e74f79298fc2b6d2427c2060b.xml
3,test/en/314d4bf068314367bc83dc5ae3757e13.xml
4,test/en/fdd27881da241cff7115c1e89caba6dd.xml


In [32]:
test=reader(test_path,ground=False)

In [33]:
test[0].head()

Unnamed: 0,text
0,#USER# I'm so sorry 😞...
1,#USER# #USER# That is correct.
2,RT #USER#: The legitimate President of the Uni...
3,#USER# #USER# #USER# Texas AG is suing the Bid...
4,#USER# In Jesus Holy Name. Soon the entire wor...


In [None]:
final_preds=[]
for i in range(len(test)):
  test[i].text=preprocessing(test[i].text)
  pred=list(clf.predict(test[i].text.values,verbose=0))
  final_preds.append(int(max(pred,key=pred.count)))


In [None]:
len(final_preds),len(test_path)

In [None]:
ids=test_path.id.apply(lambda x:x.split('/')[-1].split('.')[0])

In [None]:
lang=['en']*len(test_path)

In [None]:
final=pd.DataFrame(zip(ids,lang,final_preds),columns=['author-id','lang','type'])
final.head()