## Download and import the required libraries

In [None]:
import time
%pip install googletrans==3.1.0a0
%pip install tweet-preprocessor --user
%pip install nltk --user
import pandas as pd
import preprocessor as p
import nltk
nltk.download('punkt')
from nltk import word_tokenize
from googletrans import Translator
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer

## Function to load the required data

In [2]:
def data_download(filename):
    """Load a tab-separated file into a DataFrame and discard incomplete rows.

    Parameters
    ----------
    filename : str or path-like
        Location of a tab-delimited text file whose first line is the header.

    Returns
    -------
    pandas.DataFrame
        The parsed table with every row containing a missing value removed.
        The first rows are also printed as a quick sanity check.
    """
    frame = pd.read_table(filename, sep="\t", header=0)
    frame = frame.dropna()
    print(frame.head())
    return frame

## Methods to clean the dataset

In [3]:
import re
def remove_emojis(data):
    """Strip emoji characters from a single tweet string.

    Two passes are made: first a regular expression removes characters in
    the listed Unicode emoji/symbol ranges, then the remaining text is passed
    through the tweet-preprocessor library (``p``) with its EMOJI option
    selected.

    Parameters
    ----------
    data : str
        Raw tweet text.

    Returns
    -------
    str
        The value returned by ``p.clean`` for the regex-filtered text.
    """
    emoji_ranges = (
        u"\U00002700-\U000027BF"  # Dingbats
        u"\U0001F600-\U0001F64F"  # Emoticons
        u"\U00002600-\U000026FF"  # Miscellaneous Symbols
        u"\U0001F300-\U0001F5FF"  # Miscellaneous Symbols And Pictographs
        u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        u"\U0001F680-\U0001F6FF"  # Transport and Map Symbols
    )
    emoj = re.compile("[" + emoji_ranges + "]+", re.UNICODE)
    without_emoji = emoj.sub('', data)

    # The option is set on the preprocessor module itself before cleaning.
    p.set_options(p.OPT.EMOJI)
    return p.clean(without_emoji)

# Strips retweet ("RT") and modified-tweet ("MT") markers from a tweet in
# preparation for sentiment analysis.
def lower_tweets(text):
    """Remove standalone ``RT`` / ``MT`` markers from a tweet.

    Despite its name, this function does not lowercase anything; lowercasing
    happens in ``punct_tweet``.

    The markers are matched only as whole words. Without the word boundaries
    the patterns would also delete the letters "rt"/"Mt" from the middle of
    ordinary words (e.g. "party" -> "pay", "start" -> "sta").

    Parameters
    ----------
    text : str
        Tweet text.

    Returns
    -------
    str
        ``text`` with whole-word ``MT``/``mT``/``Mt`` and
        ``RT``/``rT``/``Rt``/``rt`` tokens removed. Surrounding whitespace is
        left untouched.
    """
    text = re.sub(r'\b(?:MT|mT|Mt)\b', '', text)  # modified-tweet marker
    text = re.sub(r'\b(?:RT|rT|Rt|rt)\b', '', text)  # retweet marker
    return text
def link_tweet(text):
    """Remove @-mentions and URLs from a tweet and collapse whitespace.

    Steps, in order:

    1. Replace with a space every ``@name`` mention, every non-alphanumeric
       character that is immediately followed by a space, and every
       ``scheme://...`` URL; then collapse runs of whitespace to one space.
    2. Remove any remaining ``@name`` handles (allowing ``-`` and ``_``).
    3. Remove literal backslash-``n`` sequences (the two characters ``\\``
       and ``n``), not real newlines.

    Parameters
    ----------
    text : str
        Tweet text.

    Returns
    -------
    str
        The cleaned tweet.
    """
    # Raw strings keep the regex escapes (\w, \/, \S) from being treated as
    # invalid Python string escapes, which newer interpreters warn about.
    # NOTE(review): the second alternative requires a trailing space after
    # the symbol, so "a!b" is left alone while "a! b" loses the "!" —
    # confirm this is intended.
    noise = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t]) |(\w+:\/\/\S+)"
    text = ' '.join(re.sub(noise, " ", text).split())
    text = re.sub(r'(@[A-Za-z]+[A-Za-z0-9-_]+)', '', text)  # remove tweeted at
    text = re.sub(r'\\n', '', text)
    return text
def punct_tweet(text):
    """Delete ASCII punctuation from ``text`` and lowercase what remains.

    Only the characters in ``string.punctuation`` are removed; digits and
    whitespace are kept as they are.

    Parameters
    ----------
    text : str
        Tweet text.

    Returns
    -------
    str
        Lowercased text without ASCII punctuation.
    """
    stripped = text.translate(str.maketrans('', '', string.punctuation))
    return stripped.lower()

def lemmatize(text):
    """Tokenize ``text`` and lemmatize every token.

    Parameters
    ----------
    text : str
        Tweet text.

    Returns
    -------
    list
        One lemmatized entry per token produced by ``word_tokenize``; the
        tokens are not joined back into a string.
    """
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in word_tokenize(text)]



### Method to translate the dataset

In [4]:
import string

def translate_data(data):
    """Translate every entry of ``data.tweetText`` to English.

    The ``tweetText`` column of the frame that is passed in is overwritten
    with the translated text, and the same frame is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``tweetText`` column.

    Returns
    -------
    pandas.DataFrame
        ``data`` with ``tweetText`` replaced by the translations.
    """
    translator = Translator()
    # NOTE(review): presumably meant to make failed requests raise instead of
    # failing silently — confirm the attribute spelling against the
    # googletrans Translator class.
    translator.raise_Exception = True

    def to_english(tweet):
        return translator.translate(tweet, dest='en').text

    data.tweetText = data.tweetText.apply(to_english)
    return data

In [5]:
def preprocess_label(data):
    """Drop unused metadata columns and fold the 'humor' label into 'fake'.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns ``timestamp``, ``userId``,
        ``imageId(s)`` and ``label``.

    Returns
    -------
    pandas.DataFrame
        A new frame without the three metadata columns and with every
        ``'humor'`` label replaced by ``'fake'``. The input frame is not
        modified. The first rows are printed as a sanity check.

    Raises
    ------
    KeyError
        If any of the columns to drop is missing.
    """
    # DataFrame.drop returns a new frame; the result has to be kept, otherwise
    # the columns are never actually removed.
    trimmed = data.drop(columns=['timestamp', 'userId', 'imageId(s)'])
    relabelled = trimmed['label'].map(
        lambda value: 'fake' if value == 'humor' else value
    )
    result = trimmed.assign(label=relabelled)
    print(result.head())
    return result
def punctuate_data(data):
    """Apply ``punct_tweet`` to each tweet, overwriting ``data.tweetText``.

    Returns the same frame that was passed in.
    """
    cleaned = data.tweetText.apply(punct_tweet)
    data.tweetText = cleaned
    return data
def emoji_data(data):
    """Apply ``remove_emojis`` to each tweet, overwriting ``data.tweetText``.

    Returns the same frame that was passed in.
    """
    cleaned = data.tweetText.apply(remove_emojis)
    data.tweetText = cleaned
    return data
def lower_data(data):
    """Apply ``lower_tweets`` to each tweet, overwriting ``data.tweetText``.

    Returns the same frame that was passed in.
    """
    cleaned = data.tweetText.apply(lower_tweets)
    data.tweetText = cleaned
    return data
def link_data(data):
    """Apply ``link_tweet`` to each tweet, overwriting ``data.tweetText``.

    Returns the same frame that was passed in.
    """
    cleaned = data.tweetText.apply(link_tweet)
    data.tweetText = cleaned
    return data
def lemmatise_data(data):
    """Apply ``lemmatize`` to each tweet, overwriting ``data.tweetText``.

    After this step every ``tweetText`` entry is whatever ``lemmatize``
    returns (a list of tokens), not a string. Returns the same frame that was
    passed in.
    """
    lemmas = data.tweetText.apply(lemmatize)
    data.tweetText = lemmas
    return data
def tag_data(data):
    """Drop rows whose tweet starts with a retweet marker.

    A tweet counts as a retweet when its text begins with the whole word
    ``RT`` (also ``rT``, ``Rt`` or ``rt``). Words that merely start with those
    letters (e.g. "Rtfoo") are kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``tweetText`` column; values are converted with ``str``
        before matching.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the non-retweet rows, so later column
        assignments do not act on a slice of the input.
    """
    # The raw-string prefix belongs outside the quotes. Written inside the
    # literal, the pattern demanded an "r" *before* the start of the string
    # and therefore never matched, so no retweet was ever filtered out.
    retweet_prefix = re.compile(r'^(?:RT|rT|Rt|rt)\b')
    is_retweet = data.tweetText.map(
        lambda tweet: bool(retweet_prefix.match(str(tweet)))
    ).astype(bool)  # keeps a boolean dtype even for an empty frame
    return data[~is_retweet].copy()

In [None]:
# Write one CSV per individual preprocessing step. The step functions
# overwrite ``tweetText`` on the frame they receive, so the training set is
# reloaded before every step to make sure each CSV reflects that step alone.
_single_step_outputs = [
    (lower_data, 'train_lower.csv'),
    (punctuate_data, 'train_punct.csv'),
    (emoji_data, 'train_emoji.csv'),
    (lemmatise_data, 'train_lemmatise.csv'),
    (link_data, 'train_link.csv'),
    (tag_data, 'train_tag.csv'),
    (translate_data, 'train_translations.csv'),
]
for _step, _csv_name in _single_step_outputs:
    train_data = data_download('/content/mediaeval-2015-trainingset.txt')
    _step(train_data).to_csv(_csv_name)

              tweetId                                          tweetText  \
0  263046056240115712  ¿Se acuerdan de la película: “El día después d...   
1  262995061304852481  @milenagimon: Miren a Sandy en NY!  Tremenda i...   
2  262979898002534400  Buena la foto del Huracán Sandy, me recuerda a...   
3  262996108400271360     Scary shit #hurricane #NY http://t.co/e4JLBUfH   
4  263018881839411200  My fave place in the world #nyc #hurricane #sa...   

      userId      imageId(s)        username                       timestamp  \
0   21226711  sandyA_fake_46         iAnnieM  Mon Oct 29 22:34:01 +0000 2012   
1  192378571  sandyA_fake_09  CarlosVerareal  Mon Oct 29 19:11:23 +0000 2012   
2  132303095  sandyA_fake_09     LucasPalape  Mon Oct 29 18:11:08 +0000 2012   
3  241995902  sandyA_fake_29     Haaaaarryyy  Mon Oct 29 19:15:33 +0000 2012   
4  250315890  sandyA_fake_15  princess__natt  Mon Oct 29 20:46:02 +0000 2012   

  label  
0  fake  
1  fake  
2  fake  
3  fake  
4  fake  
  

In [6]:
def preProcess_data(data):
    """Run the complete cleaning pipeline on a tweet DataFrame.

    The stages run in a fixed order, each receiving the previous stage's
    result: translation, retweet filtering, RT/MT marker removal, emoji
    removal, mention/link removal, punctuation removal plus lowercasing,
    lemmatization, and finally label preprocessing.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame as returned by ``data_download``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by the last stage.
    """
    pipeline = (
        translate_data,
        tag_data,
        lower_data,
        emoji_data,
        link_data,
        punctuate_data,
        lemmatise_data,
        preprocess_label,
    )
    for stage in pipeline:
        data = stage(data)
    return data


### Clean and translate the training data

In [None]:
# Load the training set, run the full pipeline and save the result.
train_data = data_download('/content/mediaeval-2015-trainingset.txt')
preProcess_data(train_data).to_csv(path_or_buf='train_final.csv')

              tweetId                                          tweetText  \
0  263046056240115712  ¿Se acuerdan de la película: “El día después d...   
1  262995061304852481  @milenagimon: Miren a Sandy en NY!  Tremenda i...   
2  262979898002534400  Buena la foto del Huracán Sandy, me recuerda a...   
3  262996108400271360     Scary shit #hurricane #NY http://t.co/e4JLBUfH   
4  263018881839411200  My fave place in the world #nyc #hurricane #sa...   

      userId      imageId(s)        username                       timestamp  \
0   21226711  sandyA_fake_46         iAnnieM  Mon Oct 29 22:34:01 +0000 2012   
1  192378571  sandyA_fake_09  CarlosVerareal  Mon Oct 29 19:11:23 +0000 2012   
2  132303095  sandyA_fake_09     LucasPalape  Mon Oct 29 18:11:08 +0000 2012   
3  241995902  sandyA_fake_29     Haaaaarryyy  Mon Oct 29 19:15:33 +0000 2012   
4  250315890  sandyA_fake_15  princess__natt  Mon Oct 29 20:46:02 +0000 2012   

  label  
0  fake  
1  fake  
2  fake  
3  fake  
4  fake  
  

## Clean and translate the test data

In [9]:
# Load the test set, run the full pipeline and save the result.
test_data = data_download('/content/mediaeval-2015-testset.txt')
preProcess_data(test_data).to_csv(path_or_buf='test_final.csv')

              tweetId                                          tweetText  \
0  578854927457349632  kereeen RT @Shyman33: Eclipse from ISS.... htt...   
1  578874632670953472  Absolutely beautiful! RT @Shyman33: Eclipse fr...   
2  578891261353984000  “@Shyman33: Eclipse from ISS.... http://t.co/C...   
3  578846612312748032        Eclipse from ISS.... http://t.co/En87OtvsU6   
4  578975333841551360  @ebonfigli: Éclipse vue de l'ISS... Autre chos...   

       userId   imageId(s)         username                       timestamp  \
0    70824972  eclipse_01            peay_s  Fri Mar 20 09:45:43 +0000 2015   
1   344707006  eclipse_01   JaredUcanChange  Fri Mar 20 11:04:02 +0000 2015   
2   224839607  eclipse_01          tpjp1231  Fri Mar 20 12:10:06 +0000 2015   
3   134543073  eclipse_01          Shyman33  Fri Mar 20 09:12:41 +0000 2015   
4  1150728872   eclipse_01       Epimethee_  Fri Mar 20 17:44:11 +0000 2015   

  label  
0  fake  
1  fake  
2  fake  
3  fake  
4  fake  
        