# Set Up

In [1]:
# Pinned MuMiN release (with its `all` extras) and a pinned torchmetrics version.
!pip install mumin[all]==1.6.2 torchmetrics==0.7.2 --quiet
# Pinned DGL wheel, looked up via the DGL wheel index (presumably the CUDA 11.1 build, per the package name).
!pip install dgl-cu111==0.7.2 -f https://data.dgl.ai/wheels/repo.html --quiet
# Pinned alpha pre-release of googletrans; note this install is not run with --quiet.
!pip install googletrans==3.1.0a0



In [2]:
from mumin import MuminDataset
from google.colab import drive
from getpass import getpass
from pathlib import Path
import shutil
import re
from PIL import Image
import itertools as it

In [3]:
# Ask for the bearer token interactively (getpass does not echo the input), so the
# secret is never written into the notebook source.
twitter_bearer_token = getpass('Input Twitter Bearer Token: ')
# Dataset handle; nothing is loaded until dataset.compile() is called.
dataset = MuminDataset(twitter_bearer_token=twitter_bearer_token)

Input Twitter Bearer Token: ··········


In [4]:
# Mount Google Drive under ./drive and copy the pre-built dataset archive into the
# working directory.
drive_dir = Path('drive')
drive.mount(str(drive_dir.resolve()), force_remount=True)

# The pattern accepts both the "MyDrive" and "My Drive" spellings of the folder.
# Take the first match, and fail with an explicit message instead of an opaque
# IndexError when the mount exposes no such folder.
drive_content_dir = next(
    (child for child in drive_dir.iterdir()
     if re.search(r'My ?Drive', str(child.stem)) is not None),
    None,
)
if drive_content_dir is None:
    raise FileNotFoundError(
        f"No 'MyDrive' or 'My Drive' folder found under {drive_dir.resolve()}")

# Last expression of the cell: shutil.copy returns the destination path.
shutil.copy(drive_content_dir / 'mumin-small.zip', 'mumin-small.zip')

Mounted at /content/drive


'mumin-small.zip'

In [5]:
# Build the dataset; the value returned by compile() is displayed as the cell output.
dataset.compile()

2022-04-18 02:40:53,763 [INFO] Loading dataset


MuminDataset(num_nodes=386,567, num_relations=472,552, size='small', compiled=True)

In [6]:
# Keep only tweets that have no missing values in any column. dropna() returns a
# new frame, so the node table owned by `dataset` is not modified and re-running
# this cell gives the same result.
tweet_df = dataset.nodes['tweet'].dropna()

# Preprocessing and Translation

### Preprocessing
Create the full DataFrame by joining each tweet to the claim it discusses.

In [7]:
import numpy as np

In [8]:
# Claim node table, and the tweet -> claim "discusses" relation table whose
# 'src' / 'tgt' columns are used as join keys in the merge below.
claim_df = dataset.nodes['claim']
discusses_df = dataset.rels[('tweet', 'discusses', 'claim')]

In [9]:
# Step 1: attach the "discusses" relation to each tweet (tweet index == relation 'src').
tweet_discusses_df = tweet_df.merge(discusses_df, left_index=True, right_on='src')
# Step 2: attach the claim each relation points at (relation 'tgt' == claim index).
tweet_with_claim_df = tweet_discusses_df.merge(claim_df, left_on='tgt', right_index=True)
# Step 3: discard the index left over from the merges in favour of a fresh 0..n-1 index.
tweet_claim_df = tweet_with_claim_df.reset_index(drop=True)

In [10]:
# Preview the first three merged tweet/claim rows.
tweet_claim_df.head(3)

Unnamed: 0,tweet_id,text,created_at,lang,source,num_retweets,num_replies,num_quote_tweets,src,tgt,...,label,reviewers,date,language,keywords,cluster_keywords,cluster,train_mask,val_mask,test_mask
0,1238947475471454220,Antes de llegar a los pulmones dura 4 días en ...,2020-03-14 21:57:51,es,Twitter for Android,8,3,0,0,0,...,misinformation,[observador.pt],2020-03-15 12:30:21,pt,corona virus reaching lungs remains,coronavirus china covid 19 treatments recommended,0,True,False,False
1,1295062953000042496,Aeroporto de Dubai em chamas. 🤕😧 https://t.co/...,2020-08-16 18:20:43,pt,Twitter for Android,6,0,5,1,1,...,misinformation,[observador.pt],2020-08-17 17:14:55,pt,big dubai airport,,-1,True,False,False
2,1294614020008312832,Fogo 🔥 no aeroporto de Dubai 😱😱 https://t.co/2...,2020-08-15 12:36:49,pt,Twitter for Android,24,11,7,2,1,...,misinformation,[observador.pt],2020-08-17 17:14:55,pt,big dubai airport,,-1,True,False,False


### Translation


In [11]:
import googletrans
from googletrans import Translator

In [12]:
# Single googletrans client, shared by every translate() call below.
translator = Translator()

In [15]:
def clean_text(text):
  """Split a tweet into its text without links and the links themselves.

  A link is any run of non-whitespace characters starting with "http".
  Mentions are left in the text. Whitespace around a removed link is kept.

  Args:
    text: Raw tweet text.

  Returns:
    A `(stripped_text, links)` tuple: the text with every link removed, and
    the list of removed links in order of appearance.
  """
  url_pattern = r"http\S+"
  found_links = re.findall(url_pattern, text)
  without_links = re.sub(url_pattern, "", text)
  return without_links, found_links

In [16]:
# One (text_without_links, links) tuple per tweet, as returned by clean_text().
clean_texts = tweet_claim_df.text.apply(clean_text)

In [17]:
# Unpack the (text, links) pairs into two columns: position 0 is the link-free
# text, position 1 is the list of extracted links.
tweet_claim_df["clean_text"] = clean_texts.map(lambda pair: pair[0])
tweet_claim_df["clean_links"] = clean_texts.map(lambda pair: pair[1])

In [18]:
def translate(text,lang):
  """Return `text` in English, using the shared `translator` when needed.

  Args:
    text: The text to translate.
    lang: Source-language code of `text`; 'en' means no translation is needed.

  Returns:
    `text` unchanged when `lang` is 'en' or when `text` is an empty or
    whitespace-only string; otherwise the translated text.

  Raises:
    Exception: whatever the fallback translation raises if it fails as well.
  """
  if lang == 'en':
    return text
  # Nothing to translate, so skip the request entirely.
  if isinstance(text, str) and not text.strip():
    return text
  try:
    return translator.translate(text, src=lang, dest='en').text
  except Exception:
    # The request with the declared language failed; retry without `src` so the
    # translator picks the source language itself. Catching Exception rather than
    # using a bare `except` lets KeyboardInterrupt / SystemExit stop a long run
    # instead of triggering a second request.
    return translator.translate(text, dest='en').text

In [19]:
# Translate every tweet, pairing each link-free text with its language code.
# Iterating the two columns together avoids a label lookup per cell and does not
# depend on the frame having a 0..n-1 index.
translations = [
  translate(text, lang)
  for text, lang in zip(tweet_claim_df["clean_text"], tweet_claim_df["lang"])
]

In [20]:
# Store the translations as a new column; `translations` has one entry per row, in row order.
tweet_claim_df["en_text"] = translations 

# Save Final DataFrame 
Combine the translated text with the extracted links, then save the final DataFrame.

In [21]:
# Re-attach the links that clean_text() removed: each "full" text is the text,
# a space, then that tweet's links joined by spaces. Building whole columns at
# once replaces the per-row .loc writes and does not depend on the index labels.
joined_links = tweet_claim_df["clean_links"].map(" ".join)
tweet_claim_df["full_en_text"] = tweet_claim_df["en_text"] + " " + joined_links
tweet_claim_df["full_m_text"] = tweet_claim_df["clean_text"] + " " + joined_links

In [22]:
# Preview the first three rows, now including the cleaned, translated and combined text columns.
tweet_claim_df.head(3)

Unnamed: 0,tweet_id,text,created_at,lang,source,num_retweets,num_replies,num_quote_tweets,src,tgt,...,cluster_keywords,cluster,train_mask,val_mask,test_mask,clean_text,clean_links,en_text,full_en_text,full_m_text
0,1238947475471454220,Antes de llegar a los pulmones dura 4 días en ...,2020-03-14 21:57:51,es,Twitter for Android,8,3,0,0,0,...,coronavirus china covid 19 treatments recommended,0,True,False,False,Antes de llegar a los pulmones dura 4 días en ...,[https://t.co/Z7EUDqcAlJ],"Before reaching the lungs, it lasts 4 days in ...","Before reaching the lungs, it lasts 4 days in ...",Antes de llegar a los pulmones dura 4 días en ...
1,1295062953000042496,Aeroporto de Dubai em chamas. 🤕😧 https://t.co/...,2020-08-16 18:20:43,pt,Twitter for Android,6,0,5,1,1,...,,-1,True,False,False,Aeroporto de Dubai em chamas. 🤕😧,[https://t.co/OwoMvD9fjz],Dubai airport on fire. 🤕😧,Dubai airport on fire. 🤕😧 https://t.co/OwoMvD9fjz,Aeroporto de Dubai em chamas. 🤕😧 https://t.co...
2,1294614020008312832,Fogo 🔥 no aeroporto de Dubai 😱😱 https://t.co/2...,2020-08-15 12:36:49,pt,Twitter for Android,24,11,7,2,1,...,,-1,True,False,False,Fogo 🔥 no aeroporto de Dubai 😱😱,[https://t.co/2Jd96sWip9],Fire 🔥 at Dubai Airport 😱😱,Fire 🔥 at Dubai Airport 😱😱 https://t.co/2Jd96s...,Fogo 🔥 no aeroporto de Dubai 😱😱 https://t.co/...


In [29]:
# Save into the Drive folder located at mount time (the same folder the source
# archive was copied from) instead of a hard-coded absolute path, so the save
# works whether the folder is named "MyDrive" or "My Drive".
tweet_claim_df.to_csv(drive_content_dir / "mumin-small-final.csv")