<a href="https://colab.research.google.com/github/talhaanwarch/Profiling-Hate-Speech-Spreaders-on-Twitter/blob/main/bert_tweet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi -L

GPU 0: Tesla P4 (UUID: GPU-ddc6f9a6-17cb-d5ea-f94f-b205f7fb6484)


#install pacakges

In [2]:
%%capture
!python3 -m pip install -U pip
!python3 -m pip install -U setuptools wheel
!python3 -m pip install -U "mxnet_cu110<2.0.0"
!python3 -m pip install autogluon

In [3]:
%%capture
!pip install -U transformers
!pip install emoji

# download data

In [4]:
import os
if os.path.isdir('/content/Profiling-Hate-Speech-Spreaders-on-Twitter') is False:
    !git clone https://github.com/talhaanwarch/Profiling-Hate-Speech-Spreaders-on-Twitter.git

In [5]:
%%capture 
if os.path.isdir('train') is False:
  !unzip /content/Profiling-Hate-Speech-Spreaders-on-Twitter/data/train.zip
if os.path.isdir('test') is False:
  !unzip /content/Profiling-Hate-Speech-Spreaders-on-Twitter/data/test.zip

# prepare data

## train data

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
df=pd.read_csv('train/en/truth.txt',sep=':::',header=None,engine='python')
df.columns=['id','label']

In [7]:
df.label.value_counts()

1    100
0    100
Name: label, dtype: int64

In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('stopwords')
lemmatizer = WordNetLemmatizer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
stop = stopwords.words('english')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
import emoji
def preprocessing(text):
    text=text.str.replace('\d+', '')
    text=text.str.replace('RT','')
    text=text.str.replace('#USER#','@USER')
    text=text.str.replace('_','')
    text=text.str.replace('#URL#','HTTPURL')
    #text= text.str.lower()
    text=text.apply(lambda x:emoji.demojize(x))
    text = text.str.replace('[^\w\s]','')
    #text = text.apply(lambda x : [lemmatizer.lemmatize(y) for y in w_tokenizer.tokenize(x)])
    #text = text.apply(lambda x: [item for item in x if item not in stop])
    #text = text.apply(lambda x : " ".join(x))
    return text

In [10]:
df.id='train/en/'+df.id.astype(str)+'.xml'
df.head()

Unnamed: 0,id,label
0,train/en/639b8e5e6a527d494c85d8f5704b1a01.xml,0
1,train/en/f2b1fc84c500c38a93522efbd422b559.xml,0
2,train/en/10b2d013382e1fb3c9414ea28329f258.xml,0
3,train/en/26644d1348fc1122e8c5ef45d6bc84fa.xml,0
4,train/en/4d4c5dcbfe38d0d33a0d1b1419952ca8.xml,0


In [11]:
len(df)

200

In [12]:
#read data
import xml.etree.ElementTree as ET
def reader(df,ground=True):
  data=[]
  for x in df.iterrows():
      
      tree = ET.parse(x[1].id)
      root = tree.getroot()
      text=[x.text for x in root[0]]
      if ground:
        label=[x[1].label]*len(text)
        data.append(pd.DataFrame(zip(text,label),columns=['text','label']))
      else:
        data.append(pd.DataFrame(text,columns=['text']))
  return data

In [13]:
data=reader(df)

In [14]:
data[0].head()

Unnamed: 0,text,label
0,RT #USER#: Funny how “15 days to slow the spre...,0
1,RT #USER#: Why did Minneapolis just give Georg...,0
2,"RT #USER#: To be fair, he has done a lot of un...",0
3,RT #USER#: President Trump got us the #HASHTAG...,0
4,RT #USER#: Is the case against former officer ...,0


In [15]:
#clean data
datax=data.copy()
for d in range(len(data)):
  datax[d].text=preprocessing(data[d].text)


In [16]:
datax[0].head()

Unnamed: 0,text,label
0,USER Funny how days to slow the spread turne...,0
1,USER Why did Minneapolis just give George Flo...,0
2,USER To be fair he has done a lot of undercov...,0
3,USER President Trump got us the HASHTAG vacci...,0
4,USER Is the case against former officer Derek...,0


In [17]:
print(np.mean([np.mean([len(i.split()) for i in j.text]) for j in datax]))
print(np.mean([np.std([len(i.split()) for i in j.text]) for j in datax]))
#find length of sequence for bert

11.720574999999998
4.910872147926636


In [18]:
len(datax[0])

200

In [19]:
doc_text,doc_label,=[],[]
for i in range(len(datax)):
  doc_text.append(list(datax[i].text))
  doc_label.append(int(datax[i].mean())) #convert data to list

In [20]:
len(doc_text),len(doc_label)

(200, 200)

## test data

In [21]:
from glob import glob #read test data
test_path=pd.DataFrame(glob('test/en/*.xml'),columns=['id'])
test_path.head()

Unnamed: 0,id
0,test/en/f2973063f16c0698a6de86c55b5f9ad6.xml
1,test/en/be71e41921653e523aceb555727b61c7.xml
2,test/en/d0235c3e74f79298fc2b6d2427c2060b.xml
3,test/en/314d4bf068314367bc83dc5ae3757e13.xml
4,test/en/fdd27881da241cff7115c1e89caba6dd.xml


In [22]:
test_data=reader(test_path,ground=False) #convert to dataframe

In [23]:
#clean
test_datax=test_data.copy()
for d in range(len(test_datax)):
  test_datax[d].text=preprocessing(test_datax[d].text)


In [24]:
test_doc=[]
for i in range(len(test_datax)):
  test_doc.append(list(test_datax[i].text))
  #convert to list

# generate embeddings

In [25]:
from transformers import AutoTokenizer, AutoModel
from torch.nn.utils.rnn import pad_sequence
import torch
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base",)
model = AutoModel.from_pretrained("vinai/bertweet-base", output_hidden_states=True)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [26]:
device='cuda'
model=model.to(device)

In [27]:
#get data from last hidden layer
def create_embeddings(doc_text,max_length=20):
  doc_embeddings=[]
  for text in doc_text:
    inputs=tokenizer(text,max_length=max_length,padding=True,truncation=True)
    text_seq = torch.tensor(inputs['input_ids']).to(device)
    text_mask = torch.tensor(inputs['attention_mask']).to(device)
    model.eval()
    with torch.no_grad():
      outputs= model(text_seq,attention_mask=text_mask)
      hidden_states=outputs.hidden_states
      emb=torch.mean(hidden_states[11], 1)
      doc_embeddings.append(emb)
  x=torch.mean(torch.stack(doc_embeddings),1)
  x=np.array(x.cpu())
  return x

In [28]:
%%time
x=create_embeddings(doc_text)

CPU times: user 43.9 s, sys: 303 ms, total: 44.2 s
Wall time: 44.2 s


In [29]:
y=doc_label
len(x),len(y)

(200, 200)

In [30]:
%%time
test=create_embeddings(test_doc)
print(len(test))

100
CPU times: user 22.5 s, sys: 37.4 ms, total: 22.5 s
Wall time: 22.5 s


# AutoGluon

In [31]:
data=pd.concat([pd.DataFrame(x),pd.DataFrame(y)],axis=1)
data.columns=['col_{}'.format(i) for i in range(x.shape[1])]+['Label']
data = data.sample(frac=1).reset_index(drop=True)
data.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19,col_20,col_21,col_22,col_23,col_24,col_25,col_26,col_27,col_28,col_29,col_30,col_31,col_32,col_33,col_34,col_35,col_36,col_37,col_38,col_39,...,col_729,col_730,col_731,col_732,col_733,col_734,col_735,col_736,col_737,col_738,col_739,col_740,col_741,col_742,col_743,col_744,col_745,col_746,col_747,col_748,col_749,col_750,col_751,col_752,col_753,col_754,col_755,col_756,col_757,col_758,col_759,col_760,col_761,col_762,col_763,col_764,col_765,col_766,col_767,Label
0,0.01954,0.008609,0.001659,-0.003287,0.001143,0.033562,0.052619,0.004695,0.032431,0.001853,0.01768,0.0595,0.010611,-0.006947,0.042606,-0.037218,0.023059,-0.030086,0.047786,-0.023791,-0.01674,-0.065161,-0.021205,-0.081362,-0.008381,0.057553,0.019333,0.027778,-0.000735,0.003062,-0.01248,0.002226,-0.048325,0.035086,-0.001859,-0.026294,0.030446,-0.006326,-0.013032,-0.088069,...,0.001008,-0.023623,-0.019958,0.000861,0.007349,-0.015087,0.020598,0.014583,-0.004601,-0.02299,0.059803,0.049352,-0.022678,0.040106,0.018941,-0.012479,0.014301,0.007051,0.029341,0.068116,-0.014705,0.004012,-0.004629,0.026379,-0.070327,0.086918,0.025259,0.090206,0.005828,0.018156,-0.002895,-0.002812,-0.017123,0.048147,-0.009066,0.005929,-0.009341,0.018544,0.043875,1
1,0.033508,0.028093,0.011475,0.001979,-0.004846,-0.003879,-4.1e-05,-0.002344,0.009437,-0.002011,0.000448,0.039067,0.000773,0.005494,0.042565,0.005873,0.014977,0.010682,0.070448,-0.006322,0.000291,-0.005635,0.021247,-0.056486,-0.02083,0.005524,0.010384,0.014498,0.02271,0.000889,-0.027603,-0.032447,-0.027857,-0.001672,0.006661,0.063793,0.011284,0.047726,0.006134,-0.048277,...,0.003795,-0.027629,-0.065584,-0.011375,0.029215,-0.050552,-0.01025,-0.018103,-0.006406,0.044304,0.027917,0.016143,-0.002386,0.016782,0.014546,0.015021,-0.020895,0.082204,0.031653,0.030579,0.000182,-0.037253,-0.014962,-0.001569,-0.040384,-0.006815,0.013729,0.064194,0.018737,0.021318,-0.008985,-0.015676,0.008217,-0.012747,-0.036342,0.009676,0.012219,0.040918,0.105558,0
2,-0.008351,0.017551,0.009074,0.006442,-0.010835,0.000692,0.02669,0.018383,0.015294,0.009221,-0.007115,0.045301,0.020623,-0.022219,0.011347,-0.021273,0.014698,-0.019046,0.053537,-0.002156,-0.001828,-0.026127,-0.012837,-0.067483,0.009723,0.028836,0.013697,-0.002869,0.060419,-0.014623,-0.003771,0.013001,-0.021046,0.024194,0.00589,0.008021,0.022967,0.019159,0.003265,-0.062535,...,0.020937,0.010689,-0.031185,0.008981,0.033551,-0.011218,0.015166,-0.001994,-0.004475,0.01178,0.015588,0.049849,-0.0282,0.05576,0.004412,-0.013559,-0.002564,0.002169,-0.006656,0.048428,-0.000798,-0.004872,-0.005108,0.024482,-0.071775,0.076504,0.011089,0.084333,0.011414,0.01805,0.004057,0.009662,-0.00557,0.025379,-0.027584,-0.009082,0.005345,0.042779,0.050775,1
3,0.012651,0.022093,-0.004498,0.012453,-0.024709,0.041618,0.039298,0.011487,0.00881,0.061249,-0.005716,0.030278,0.005301,-0.019966,0.048614,-0.011641,0.019922,-0.020296,0.066449,-0.000113,0.021452,-0.077187,-0.00509,-0.062084,0.021774,0.034375,0.014642,0.008343,0.021328,-0.009128,-0.015257,0.02898,-0.010954,-0.001067,0.001562,-0.02379,0.028283,0.00611,-0.003525,-0.09994,...,0.007896,-0.005893,-0.039402,0.02971,0.037473,-0.015743,0.019942,0.012225,-0.014714,0.014092,0.024035,-0.008263,-0.0012,0.067802,0.004408,-0.015227,-0.014965,-0.040012,-0.015752,0.015245,-0.004686,0.013585,-0.00596,0.032274,-0.071044,0.072323,0.029705,0.089797,0.001139,0.009156,0.005829,0.016343,-0.006164,0.036924,-0.026579,-0.015197,0.010169,0.039137,0.072062,0
4,0.001741,0.006838,0.011289,0.001854,-0.001393,0.000324,0.006192,0.004335,0.023194,-0.008308,-0.012467,0.048771,0.001201,0.001151,0.005848,-0.015817,0.001053,0.023844,0.075005,-0.010226,0.00689,-0.010575,-0.022384,-0.050788,-0.03502,0.015101,0.01247,0.014693,0.04297,-0.002295,-0.015092,-0.004717,-0.019155,0.01599,0.009017,0.014141,0.004435,0.049979,0.003252,-0.083084,...,0.003401,-0.005345,-0.04399,0.006939,0.009737,-0.005398,-0.011313,-0.016018,-0.00819,0.048535,-0.017254,0.054355,-0.042012,0.001408,-0.010972,-0.003379,-0.019748,0.0535,0.028845,0.02465,-0.005348,-0.019101,-0.011293,-0.002115,-0.04067,0.012335,0.000502,0.06357,-0.006034,0.029311,-0.027523,-0.01321,0.004327,-0.005271,-0.027062,-0.001921,-0.000701,0.011695,0.047907,0


In [32]:
test_df=pd.DataFrame(test)
test_df.columns=['col_{}'.format(i) for i in range(test.shape[1])]
test_df.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19,col_20,col_21,col_22,col_23,col_24,col_25,col_26,col_27,col_28,col_29,col_30,col_31,col_32,col_33,col_34,col_35,col_36,col_37,col_38,col_39,...,col_728,col_729,col_730,col_731,col_732,col_733,col_734,col_735,col_736,col_737,col_738,col_739,col_740,col_741,col_742,col_743,col_744,col_745,col_746,col_747,col_748,col_749,col_750,col_751,col_752,col_753,col_754,col_755,col_756,col_757,col_758,col_759,col_760,col_761,col_762,col_763,col_764,col_765,col_766,col_767
0,0.001754,0.024795,0.004462,0.023531,-0.000514,0.021803,0.032669,0.0114,0.019625,0.019211,-0.031634,0.041581,0.032604,0.014153,0.032584,-0.019381,0.012158,0.000668,0.036718,-0.009588,-0.026324,-0.045127,0.004489,-0.044984,0.025092,0.023681,0.000446,0.011683,0.045299,-0.003765,0.006639,0.007357,-0.010212,0.012721,-0.021091,0.006044,-0.002831,0.016584,0.001313,-0.14175,...,0.008706,0.046146,0.02149,-0.05739,0.007033,0.041118,0.028251,0.021366,0.008927,-0.021163,-0.009261,0.042663,0.052698,-0.018768,0.099278,0.001892,-0.021597,-0.020557,-0.031995,0.003246,0.018443,-0.031394,0.002089,-0.014686,0.01909,-0.083967,0.070653,0.016554,0.072213,0.018719,-0.002226,-0.043061,-0.01267,0.003802,0.058948,-0.021597,-0.01289,0.039327,0.031281,0.067092
1,-0.051948,-0.004511,0.02665,-0.042148,-0.000822,0.08877,0.072916,0.017292,-0.031195,0.088828,-0.00644,0.089824,-0.054378,-0.026148,-0.022116,-0.041793,0.012314,-0.067712,0.105116,0.036269,-0.061616,-0.02185,-0.0389,-0.085158,-0.020362,0.016822,0.03063,0.107706,0.071346,-0.02395,-0.009565,-0.015033,0.02235,0.01229,0.03676,-0.027276,0.091931,-0.075555,0.023854,-0.14088,...,0.057774,-0.131306,0.034544,-0.009475,0.095725,0.014392,0.005084,-0.018414,0.055759,0.055017,-0.005136,-0.022165,0.015826,-0.02064,0.000507,-0.003601,-0.069692,0.019264,-0.029995,-0.001665,0.05718,0.02607,-0.027262,0.042294,-0.013222,-0.064247,0.161862,0.013918,0.108296,0.030186,0.034229,-0.037477,-0.01007,-0.009269,0.025104,-0.030197,-0.007123,-0.010471,-0.034998,0.009148
2,0.007791,0.019822,0.007637,-0.009873,-0.005838,0.013319,0.037538,0.028747,0.02285,0.028781,0.001351,0.048016,0.009161,-0.006895,0.025722,-0.025014,0.024292,-0.041152,0.068784,-0.000442,-0.004778,-0.056442,-0.012329,-0.063068,-0.008681,0.028039,0.02142,0.027004,0.080795,-0.013304,-0.006013,0.015192,-0.024167,0.022336,0.01202,-0.003121,0.023176,-0.001796,-0.014873,-0.073905,...,0.050149,0.021367,0.005813,-0.029883,0.02023,0.026668,-0.024716,0.016242,-0.019423,-0.0111,0.001614,0.019643,0.036184,0.002698,0.052374,-0.009242,-0.010173,-6e-06,-0.020805,0.0099,0.058705,-0.004611,-0.017624,-0.001931,0.028311,-0.098946,0.065244,0.018798,0.085048,0.03254,0.011901,-0.01074,0.000621,-0.006504,0.023472,-0.015728,-0.015181,0.000168,0.034005,0.052392
3,0.021141,0.035219,-0.011929,-0.009078,-0.000629,0.000824,-0.000587,-0.009307,-0.005538,-0.005704,-0.012918,0.020378,0.013326,-0.002653,0.041084,0.010552,-0.012909,0.009885,0.057803,-0.029851,0.006994,-0.032244,0.020394,-0.049839,-0.006741,-0.006648,0.002552,0.050298,0.009181,0.018706,-0.018657,-0.026027,-0.01764,0.021673,-0.00047,0.054092,0.01684,0.005629,-0.002634,-0.064661,...,0.012425,-0.027362,-0.033903,-0.058426,0.010607,0.038053,-0.074582,-0.02173,0.005448,0.003526,0.041138,0.06168,0.011414,-0.023021,0.005707,0.006361,0.007973,-0.018491,0.052566,0.037613,0.016803,-0.010327,-0.04067,-0.024227,0.003977,-0.022236,0.035796,0.021845,0.079879,0.01455,0.01297,-0.002415,0.000121,-0.005612,0.045515,-0.040167,0.003144,0.017503,0.035809,0.076333
4,-0.01978,0.009907,0.026828,-0.000532,0.000214,0.01,0.027658,0.039714,0.00468,0.008388,-0.019198,0.045436,0.016963,0.005281,0.017419,-0.010952,0.011683,-0.032281,0.079293,0.004996,0.005919,-0.06711,-0.013999,-0.046911,-0.003722,0.076137,0.008824,-0.041124,0.108075,-0.003796,0.012377,0.024645,-0.040168,0.023545,0.003189,0.009605,-0.017241,0.064777,-0.006667,-0.084374,...,0.018089,0.037997,0.007748,-0.052683,0.015847,0.039029,-0.006674,0.031491,0.021852,0.003699,0.081016,0.002571,0.02312,0.005211,0.06119,0.00226,-0.033711,-0.009587,-0.008022,0.009238,0.05486,-0.01299,-0.014217,0.002119,0.024302,-0.076504,0.029595,0.021265,0.114605,0.023795,0.022583,-0.036615,0.007952,0.009883,0.056019,-0.03718,0.017394,0.017366,0.080706,0.103436


In [33]:
%%time
from autogluon.tabular import  TabularPredictor
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report,accuracy_score
skf = StratifiedKFold(n_splits=5)

acc=[]
test_pred=[]
for train_index, test_index in skf.split(data.iloc[:,0:-1], data.iloc[:,-1]):
  train=data.iloc[train_index,:]
  test=data.iloc[test_index,:]
  clf=TabularPredictor(label='Label',verbosity=0).fit(train, presets='best_quality')
  res=clf.leaderboard(test,silent=True)
  res['score_test']=res['score_test'].round(3)*100
  pred = clf.predict(test,model=res.model[0])
  accuracy=accuracy_score(test.Label,pred)
  acc.append(accuracy)
  print('best model ',res.model[0],' accuracy is ',accuracy)
  test_pred.append(clf.predict(test_df,model=res.model[0]))
print('average accuracy is',np.mean(acc))

best model  RandomForestEntr_BAG_L1  accuracy is  0.7
best model  KNeighborsDist_BAG_L1  accuracy is  0.775
best model  RandomForestGini_BAG_L1  accuracy is  0.675
best model  LightGBMLarge_BAG_L1  accuracy is  0.75
best model  RandomForestGini_BAG_L1  accuracy is  0.7
average accuracy is 0.7200000000000001
CPU times: user 20min 7s, sys: 14.8 s, total: 20min 22s
Wall time: 12min 16s


In [34]:
ids=test_path.id.apply(lambda x:x.split('/')[-1].split('.')[0])
pred=np.mean(test_pred,axis=0)
lang=['en']*len(test_path)
pred=np.where(pred>0.5,1,0)

In [35]:
final=pd.DataFrame(zip(ids,lang,pred),columns=['author-id','lang','type'])
final.head()

Unnamed: 0,author-id,lang,type
0,f2973063f16c0698a6de86c55b5f9ad6,en,1
1,be71e41921653e523aceb555727b61c7,en,0
2,d0235c3e74f79298fc2b6d2427c2060b,en,0
3,314d4bf068314367bc83dc5ae3757e13,en,1
4,fdd27881da241cff7115c1e89caba6dd,en,1


In [36]:
final.to_csv('first.csv',index=False)

In [37]:
if os.path.isdir('output'):
  !rm -rf output
  !mkdir output
  !rm talha.zip
else:
  !mkdir output

rm: cannot remove 'talha.zip': No such file or directory


In [38]:
import pandas as pd
df=final.copy()
import pandas as pd
for row in df.index:
    xml = []
    xml.append('<author id="{}"'.format(df.loc[row,'author-id']))
    xml.append('lang="{}"'.format(df.loc[row,'lang']))
    xml.append('type="{}"'.format(df.loc[row,'type']))
    xml.append('/>')
    x='\n'.join(xml)
    myfile = open("output/{}.xml".format(df.loc[row,'author-id']), "w")
    myfile.write(x)
    myfile.close()
    

In [39]:
%%capture
!zip -r talha.zip output