# **Initialization**

## **Imports and Installations**

In [None]:
!pip install tweet-preprocessor

Collecting tweet-preprocessor
  Downloading tweet_preprocessor-0.6.0-py3-none-any.whl (27 kB)
Installing collected packages: tweet-preprocessor
Successfully installed tweet-preprocessor-0.6.0


In [None]:
import pandas as pd
import requests
from urllib.parse import urlparse
import preprocessor as p
import string
import csv
import random

# **Creating Dataframes for Raw and Preprocessed Data**

In [None]:
# Load the raw tweet/claim pairs: the first CSV row is the header and the
# first CSV column is used as the DataFrame index.
summ_data_raw = pd.read_csv(
    'summ_data_eng_eng_raw.csv',
    index_col=0,
    header=0,
)
summ_data_raw.head()

Unnamed: 0,tweet_url,tweet,evidence_url,claim_reviewed,current_owner_id,indian,user_id,claim_reviewed_markup_language,topic,tweet_language
7,https://twitter.com/GenPanwar/status/124234712...,Corona Vaccine is ready to be launched in USA.,https://www.altnews.in/image-of-covid-19-test-...,"""Roche Medical company"" will launch the vaccin...",4,1,1,English,Health,English
34,https://twitter.com/BhatkhalkarA/status/124529...,18 nations including USA and UK wants @narendr...,https://www.altnews.in/no-us-and-uk-have-not-s...,18 nations including USA and UK want PM Modi t...,4,1,1,English,Health,English
37,https://twitter.com/prettypadmaja/status/12457...,18 nations including USA and UK wants PM Modi ...,https://www.altnews.in/no-us-and-uk-have-not-s...,18 nations including USA and UK want PM Modi t...,4,1,1,English,Health,English
45,https://twitter.com/TrulyMonica/status/1345776...,190 countries have pre-booked Bharat Biotech v...,https://www.altnews.in/india-tvs-rajat-sharma-...,190 countries have pre-booked Bharat Biotech C...,4,1,1,English,Health,English
88,https://twitter.com/ShadRaza1/status/121946187...,"#Russia's metro station, if u do 30 situps ur ...",https://www.altnews.in/fact-check-moscow-metro...,30 squats guarantees a ticket on the Moscow metro,4,1,1,English,Health,English


## **Preprocessing Steps**
1. Remove URLs and emojis
2. Replace `@` and `#` with whitespace
3. Replace punctuation with whitespace
4. Lowercase the text
5. Collapse excess whitespace

## **Preprocessed Data**

In [None]:
# Baseline variant: URLs and emojis are removed, '#'/'@' and all other
# punctuation become spaces, text is lowercased and whitespace is collapsed.
summ_data_preprocessed = summ_data_raw.copy(deep=True)

# Maps every ASCII punctuation character to a single space. Built once here
# instead of once per row.
punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

# p.clean() strips exactly the entity types selected here.
p.set_options(p.OPT.URL, p.OPT.EMOJI)
summ_data_preprocessed['tweet'] = (
    summ_data_preprocessed['tweet']
    .apply(p.clean)
    .str.replace(r'[#@]', ' ', regex=True)
    .str.translate(punct_to_space)
    .str.lower()
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

# The claim text gets the same normalisation, minus the tweet-specific cleaning.
summ_data_preprocessed['claim_reviewed'] = (
    summ_data_preprocessed['claim_reviewed']
    .str.translate(punct_to_space)
    .str.lower()
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)
summ_data_preprocessed.head()

Unnamed: 0,tweet_url,tweet,evidence_url,claim_reviewed,current_owner_id,indian,user_id,claim_reviewed_markup_language,topic,tweet_language
7,https://twitter.com/GenPanwar/status/124234712...,corona vaccine is ready to be launched in usa,https://www.altnews.in/image-of-covid-19-test-...,roche medical company will launch the vaccine ...,4,1,1,English,Health,English
34,https://twitter.com/BhatkhalkarA/status/124529...,18 nations including usa and uk wants narendra...,https://www.altnews.in/no-us-and-uk-have-not-s...,18 nations including usa and uk want pm modi t...,4,1,1,English,Health,English
37,https://twitter.com/prettypadmaja/status/12457...,18 nations including usa and uk wants pm modi ...,https://www.altnews.in/no-us-and-uk-have-not-s...,18 nations including usa and uk want pm modi t...,4,1,1,English,Health,English
45,https://twitter.com/TrulyMonica/status/1345776...,190 countries have pre booked bharat biotech v...,https://www.altnews.in/india-tvs-rajat-sharma-...,190 countries have pre booked bharat biotech c...,4,1,1,English,Health,English
88,https://twitter.com/ShadRaza1/status/121946187...,russia s metro station if u do 30 situps ur ti...,https://www.altnews.in/fact-check-moscow-metro...,30 squats guarantees a ticket on the moscow metro,4,1,1,English,Health,English


In [None]:
# Write the baseline preprocessed variant to disk as CSV.
summ_data_preprocessed.to_csv('summ_data_eng_preprocessed.csv')

## **Emoji Replaced Data**

In [None]:
# Emoji-replaced variant: same normalisation as the baseline, but emojis are
# kept through cleaning and swapped for a placeholder token at the very end.
summ_data_preprocessed_emoji_replaced = summ_data_raw.copy(deep=True)

# Maps every ASCII punctuation character to a single space. Built once here
# instead of once per row.
punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

# Only URLs are removed by p.clean(); emojis must survive until tokenisation.
p.set_options(p.OPT.URL)
summ_data_preprocessed_emoji_replaced['tweet'] = (
    summ_data_preprocessed_emoji_replaced['tweet']
    .apply(p.clean)
    .str.replace(r'[#@]', ' ', regex=True)
    .str.translate(punct_to_space)
    .str.lower()
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

# Tokenisation runs last, after punctuation stripping and lowercasing,
# presumably so the inserted placeholder is not altered by those steps.
p.set_options(p.OPT.EMOJI)
summ_data_preprocessed_emoji_replaced['tweet'] = (
    summ_data_preprocessed_emoji_replaced['tweet'].apply(p.tokenize)
)

summ_data_preprocessed_emoji_replaced['claim_reviewed'] = (
    summ_data_preprocessed_emoji_replaced['claim_reviewed']
    .str.translate(punct_to_space)
    .str.lower()
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)
summ_data_preprocessed_emoji_replaced.head()

Unnamed: 0,tweet_url,tweet,evidence_url,claim_reviewed,current_owner_id,indian,user_id,claim_reviewed_markup_language,topic,tweet_language
7,https://twitter.com/GenPanwar/status/124234712...,corona vaccine is ready to be launched in usa,https://www.altnews.in/image-of-covid-19-test-...,roche medical company will launch the vaccine ...,4,1,1,English,Health,English
34,https://twitter.com/BhatkhalkarA/status/124529...,18 nations including usa and uk wants narendra...,https://www.altnews.in/no-us-and-uk-have-not-s...,18 nations including usa and uk want pm modi t...,4,1,1,English,Health,English
37,https://twitter.com/prettypadmaja/status/12457...,18 nations including usa and uk wants pm modi ...,https://www.altnews.in/no-us-and-uk-have-not-s...,18 nations including usa and uk want pm modi t...,4,1,1,English,Health,English
45,https://twitter.com/TrulyMonica/status/1345776...,190 countries have pre booked bharat biotech v...,https://www.altnews.in/india-tvs-rajat-sharma-...,190 countries have pre booked bharat biotech c...,4,1,1,English,Health,English
88,https://twitter.com/ShadRaza1/status/121946187...,russia s metro station if u do 30 situps ur ti...,https://www.altnews.in/fact-check-moscow-metro...,30 squats guarantees a ticket on the moscow metro,4,1,1,English,Health,English


In [None]:
# Write the emoji-replaced variant to disk as CSV.
summ_data_preprocessed_emoji_replaced.to_csv('summ_data_eng_preprocessed_emoji_replaced.csv')

## **Hashtag and Mention removed data**

In [None]:
# Variant with whole #hashtags and @mentions deleted by p.clean(), in addition
# to URLs and emojis; the remaining text is normalised like the baseline.
summ_data_preprocessed_hashtag_mention_removed = summ_data_raw.copy(deep=True)

# Maps every ASCII punctuation character to a single space. Built once here
# instead of once per row.
punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION, p.OPT.HASHTAG)
summ_data_preprocessed_hashtag_mention_removed['tweet'] = (
    summ_data_preprocessed_hashtag_mention_removed['tweet']
    .apply(p.clean)
    .str.translate(punct_to_space)
    .str.lower()
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

summ_data_preprocessed_hashtag_mention_removed['claim_reviewed'] = (
    summ_data_preprocessed_hashtag_mention_removed['claim_reviewed']
    .str.translate(punct_to_space)
    .str.lower()
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)
summ_data_preprocessed_hashtag_mention_removed.head()

Unnamed: 0,tweet_url,tweet,evidence_url,claim_reviewed,current_owner_id,indian,user_id,claim_reviewed_markup_language,topic,tweet_language
7,https://twitter.com/GenPanwar/status/124234712...,corona vaccine is ready to be launched in usa,https://www.altnews.in/image-of-covid-19-test-...,roche medical company will launch the vaccine ...,4,1,1,English,Health,English
34,https://twitter.com/BhatkhalkarA/status/124529...,18 nations including usa and uk wants as leade...,https://www.altnews.in/no-us-and-uk-have-not-s...,18 nations including usa and uk want pm modi t...,4,1,1,English,Health,English
37,https://twitter.com/prettypadmaja/status/12457...,18 nations including usa and uk wants pm modi ...,https://www.altnews.in/no-us-and-uk-have-not-s...,18 nations including usa and uk want pm modi t...,4,1,1,English,Health,English
45,https://twitter.com/TrulyMonica/status/1345776...,190 countries have pre booked bharat biotech v...,https://www.altnews.in/india-tvs-rajat-sharma-...,190 countries have pre booked bharat biotech c...,4,1,1,English,Health,English
88,https://twitter.com/ShadRaza1/status/121946187...,s metro station if u do 30 situps ur ticket is...,https://www.altnews.in/fact-check-moscow-metro...,30 squats guarantees a ticket on the moscow metro,4,1,1,English,Health,English


In [None]:
# Write the hashtag-and-mention-removed variant to disk as CSV.
summ_data_preprocessed_hashtag_mention_removed.to_csv('summ_data_eng_preprocessed_hashtag_mention_removed.csv')

## **Hashtag removed data**

In [None]:
# Variant with whole #hashtags deleted by p.clean(); @mentions keep their
# text and only lose the '@' sign.
summ_data_preprocessed_hashtag_removed = summ_data_raw.copy(deep=True)

# Maps every ASCII punctuation character to a single space. Built once here
# instead of once per row.
punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.HASHTAG)
summ_data_preprocessed_hashtag_removed['tweet'] = (
    summ_data_preprocessed_hashtag_removed['tweet']
    .apply(p.clean)
    .str.replace('@', ' ', regex=False)
    .str.translate(punct_to_space)
    .str.lower()
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

summ_data_preprocessed_hashtag_removed['claim_reviewed'] = (
    summ_data_preprocessed_hashtag_removed['claim_reviewed']
    .str.translate(punct_to_space)
    .str.lower()
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)
summ_data_preprocessed_hashtag_removed.head()

Unnamed: 0,tweet_url,tweet,evidence_url,claim_reviewed,current_owner_id,indian,user_id,claim_reviewed_markup_language,topic,tweet_language
7,https://twitter.com/GenPanwar/status/124234712...,corona vaccine is ready to be launched in usa,https://www.altnews.in/image-of-covid-19-test-...,roche medical company will launch the vaccine ...,4,1,1,English,Health,English
34,https://twitter.com/BhatkhalkarA/status/124529...,18 nations including usa and uk wants narendra...,https://www.altnews.in/no-us-and-uk-have-not-s...,18 nations including usa and uk want pm modi t...,4,1,1,English,Health,English
37,https://twitter.com/prettypadmaja/status/12457...,18 nations including usa and uk wants pm modi ...,https://www.altnews.in/no-us-and-uk-have-not-s...,18 nations including usa and uk want pm modi t...,4,1,1,English,Health,English
45,https://twitter.com/TrulyMonica/status/1345776...,190 countries have pre booked bharat biotech v...,https://www.altnews.in/india-tvs-rajat-sharma-...,190 countries have pre booked bharat biotech c...,4,1,1,English,Health,English
88,https://twitter.com/ShadRaza1/status/121946187...,s metro station if u do 30 situps ur ticket is...,https://www.altnews.in/fact-check-moscow-metro...,30 squats guarantees a ticket on the moscow metro,4,1,1,English,Health,English


In [None]:
# Write the hashtag-removed variant to disk as CSV.
summ_data_preprocessed_hashtag_removed.to_csv('summ_data_eng_preprocessed_hashtag_removed.csv')

## **Mention removed data**

In [None]:
# Variant with whole @mentions deleted by p.clean(); #hashtags keep their
# text and only lose the '#' sign.
summ_data_preprocessed_mention_removed = summ_data_raw.copy(deep=True)

# Maps every ASCII punctuation character to a single space. Built once here
# instead of once per row.
punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION)
summ_data_preprocessed_mention_removed['tweet'] = (
    summ_data_preprocessed_mention_removed['tweet']
    .apply(p.clean)
    .str.replace('#', ' ', regex=False)
    .str.translate(punct_to_space)
    .str.lower()
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

summ_data_preprocessed_mention_removed['claim_reviewed'] = (
    summ_data_preprocessed_mention_removed['claim_reviewed']
    .str.translate(punct_to_space)
    .str.lower()
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)
summ_data_preprocessed_mention_removed.head()

Unnamed: 0,tweet_url,tweet,evidence_url,claim_reviewed,current_owner_id,indian,user_id,claim_reviewed_markup_language,topic,tweet_language
7,https://twitter.com/GenPanwar/status/124234712...,corona vaccine is ready to be launched in usa,https://www.altnews.in/image-of-covid-19-test-...,roche medical company will launch the vaccine ...,4,1,1,English,Health,English
34,https://twitter.com/BhatkhalkarA/status/124529...,18 nations including usa and uk wants as leade...,https://www.altnews.in/no-us-and-uk-have-not-s...,18 nations including usa and uk want pm modi t...,4,1,1,English,Health,English
37,https://twitter.com/prettypadmaja/status/12457...,18 nations including usa and uk wants pm modi ...,https://www.altnews.in/no-us-and-uk-have-not-s...,18 nations including usa and uk want pm modi t...,4,1,1,English,Health,English
45,https://twitter.com/TrulyMonica/status/1345776...,190 countries have pre booked bharat biotech v...,https://www.altnews.in/india-tvs-rajat-sharma-...,190 countries have pre booked bharat biotech c...,4,1,1,English,Health,English
88,https://twitter.com/ShadRaza1/status/121946187...,russia s metro station if u do 30 situps ur ti...,https://www.altnews.in/fact-check-moscow-metro...,30 squats guarantees a ticket on the moscow metro,4,1,1,English,Health,English


In [None]:
# Write the mention-removed variant to disk as CSV.
summ_data_preprocessed_mention_removed.to_csv('summ_data_eng_preprocessed_mention_removed.csv')

## **Hashtag Mention run removed data**

In [None]:
def _drop_repeated_prefixed_words(text, prefix):
  """Drop every `prefix`-word that directly follows another `prefix`-word.

  The text is split on single spaces, so consecutive spaces produce empty
  tokens; an empty token between two prefixed words therefore breaks the run.

  Args:
    text: Input string.
    prefix: Marker a token must start with to count as part of a run.

  Returns:
    The tokens that were kept, re-joined with single spaces. The first word
    of each run is kept; the rest of the run is dropped.
  """
  words = text.split(' ')
  kept = [
      word
      for pos, word in enumerate(words)
      # Drop a word only when it continues a run started by the previous token.
      if not (pos > 0 and word.startswith(prefix) and words[pos - 1].startswith(prefix))
  ]
  return ' '.join(kept)


def remove_run_of_mentions(text):
  """Collapse each run of consecutive @mentions down to its first mention.

  Args:
    text: Tweet text whose tokens are separated by single spaces.

  Returns:
    The text with every '@'-token that immediately follows another '@'-token
    removed.
  """
  return _drop_repeated_prefixed_words(text, '@')


def remove_run_of_hashtags(text):
  """Collapse each run of consecutive #hashtags down to its first hashtag.

  Args:
    text: Tweet text whose tokens are separated by single spaces.

  Returns:
    The text with every '#'-token that immediately follows another '#'-token
    removed.
  """
  return _drop_repeated_prefixed_words(text, '#')





In [None]:
# Variant where each run of consecutive @mentions / #hashtags is collapsed to
# its first element before the usual normalisation is applied.
summ_data_preprocessed_hashtag_mention_run_removed = summ_data_raw.copy(deep=True)

# Maps every ASCII punctuation character to a single space. Built once here
# instead of once per row.
punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

p.set_options(p.OPT.URL, p.OPT.EMOJI)
summ_data_preprocessed_hashtag_mention_run_removed['tweet'] = (
    summ_data_preprocessed_hashtag_mention_run_removed['tweet']
    .apply(p.clean)
    # Runs must be collapsed while the '@' / '#' markers are still present.
    .apply(remove_run_of_mentions)
    .apply(remove_run_of_hashtags)
    .str.replace(r'[#@]', ' ', regex=True)
    .str.translate(punct_to_space)
    .str.lower()
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

summ_data_preprocessed_hashtag_mention_run_removed['claim_reviewed'] = (
    summ_data_preprocessed_hashtag_mention_run_removed['claim_reviewed']
    .str.translate(punct_to_space)
    .str.lower()
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)
summ_data_preprocessed_hashtag_mention_run_removed.head()

Unnamed: 0,tweet_url,tweet,evidence_url,claim_reviewed,current_owner_id,indian,user_id,claim_reviewed_markup_language,topic,tweet_language
7,https://twitter.com/GenPanwar/status/124234712...,corona vaccine is ready to be launched in usa,https://www.altnews.in/image-of-covid-19-test-...,roche medical company will launch the vaccine ...,4,1,1,English,Health,English
34,https://twitter.com/BhatkhalkarA/status/124529...,18 nations including usa and uk wants narendra...,https://www.altnews.in/no-us-and-uk-have-not-s...,18 nations including usa and uk want pm modi t...,4,1,1,English,Health,English
37,https://twitter.com/prettypadmaja/status/12457...,18 nations including usa and uk wants pm modi ...,https://www.altnews.in/no-us-and-uk-have-not-s...,18 nations including usa and uk want pm modi t...,4,1,1,English,Health,English
45,https://twitter.com/TrulyMonica/status/1345776...,190 countries have pre booked bharat biotech v...,https://www.altnews.in/india-tvs-rajat-sharma-...,190 countries have pre booked bharat biotech c...,4,1,1,English,Health,English
88,https://twitter.com/ShadRaza1/status/121946187...,russia s metro station if u do 30 situps ur ti...,https://www.altnews.in/fact-check-moscow-metro...,30 squats guarantees a ticket on the moscow metro,4,1,1,English,Health,English


In [None]:
# Write the run-collapsed variant to disk as CSV.
summ_data_preprocessed_hashtag_mention_run_removed.to_csv('summ_data_eng_preprocessed_hashtag_mention_run_removed.csv')

## **Hashtag Mention run removed Mention replaced data**

In [None]:
# token taken from https://oauth-playground.glitch.me/?id=findUserByUsername&params=%28%27query%21%28%29%7Ebody%21%27%27%7Epath%21%28%27*%7E**username%21%27TwitterDev%27%29%01*_
# Base URL of the user-lookup-by-username route; the handle (without '@') is
# appended directly to this string when a lookup is made.
endpoint = "https://api.twitter.com/2/users/by/username/"
# Headers sent with every lookup request. "<your_token>" is a placeholder for
# the full Authorization header value.
# NOTE(review): supply the real credential at runtime (e.g. from an environment
# variable) instead of saving it in the notebook.
headers = {"Authorization": "<your_token>"}

In [None]:
def replace_handle_with_name(text):
  """Replace each @handle in `text` with the account's display name.

  Every space-separated token that starts with '@' is looked up by appending
  the rest of the token to the module-level `endpoint` and sending the
  module-level `headers`. The token is replaced by the `data.name` field of
  the JSON response. When a lookup fails (network error, non-JSON body, or a
  response without that field) the original token is left unchanged.

  Args:
    text: Tweet text whose tokens are separated by single spaces.

  Returns:
    The tokens re-joined with single spaces, with resolved handles replaced
    by display names.
  """
  word_list = text.split(' ')

  for i, word in enumerate(word_list):
    # A bare '@' carries no username, so there is nothing to look up.
    if not word.startswith('@') or len(word) == 1:
      continue
    # NOTE(review): any trailing punctuation (e.g. '@user,') is sent as part
    # of the username - confirm whether it should be stripped first.
    try:
      # The timeout keeps one unresponsive request from stalling the whole column.
      response = requests.get(endpoint + word[1:], headers=headers, timeout=10)
      word_list[i] = response.json()['data']['name']
      print(word_list[i])  # progress log of resolved names
    except (requests.RequestException, ValueError, KeyError, TypeError):
      # Best effort: keep the raw handle rather than aborting the run, while
      # still letting KeyboardInterrupt and unrelated errors propagate.
      continue

  return ' '.join(word_list)

In [None]:
# Variant where runs of @mentions / #hashtags are collapsed and each surviving
# @handle is replaced by the account's display name before normalisation.
summ_data_preprocessed_hashtag_mention_run_removed_mention_replaced = summ_data_raw.copy(deep=True)

# Maps every ASCII punctuation character to a single space. Built once here
# instead of once per row.
punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

p.set_options(p.OPT.URL, p.OPT.EMOJI)
summ_data_preprocessed_hashtag_mention_run_removed_mention_replaced['tweet'] = (
    summ_data_preprocessed_hashtag_mention_run_removed_mention_replaced['tweet']
    .apply(p.clean)
    # Collapse mention runs first so fewer handles have to be looked up.
    .apply(remove_run_of_mentions)
    # Issues one HTTP request per remaining @handle.
    .apply(replace_handle_with_name)
    .apply(remove_run_of_hashtags)
    .str.replace('#', ' ', regex=False)
    # Handles that could not be resolved lose their '@' here, since '@' is
    # part of string.punctuation.
    .str.translate(punct_to_space)
    .str.lower()
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

summ_data_preprocessed_hashtag_mention_run_removed_mention_replaced['claim_reviewed'] = (
    summ_data_preprocessed_hashtag_mention_run_removed_mention_replaced['claim_reviewed']
    .str.translate(punct_to_space)
    .str.lower()
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)
summ_data_preprocessed_hashtag_mention_run_removed_mention_replaced.head()

Narendra Modi
Narendra Modi
Dionisios Favatas
Akshay Kumar
ANI
Tarek Fatah
Deepika Padukone
Rubika Liyaquat
Delhi Police
निधि त्रिपाठी நிதி திரிபாதி
Josh Hawley
barkha dutt
Narendra Modi
Prime Minister's Office
Alt News
Delhi Police
Shehzad Jai Hind
Rahul Gandhi
Amit Shah
Indian Air Force
Tajinder Pal Singh Bagga
KTR
BJP Telangana
Congress
Narendra Modi
Mumbai Police
Saba Naqvi
Ministry of Health
माझी Mumbai, आपली BMC
Nitin Gadkari
Amit Shah
Siddharth Bakaria 🇮🇳
Javed Akhtar
Zee News
News18 Tamil Nadu
S Narayanan
Mumbai Police
RSS
BOOM Live
BOOM Live
Kanhaiya Kumar
BOOM Live
PMO India
Office of Uddhav Thackeray
Akhilesh P. Singh
Mohammed Zubair
Pratik Sinha
Mohit Gulati 🇮🇳
BOOM Live
SM Hoax Slayer
Alt News
AAP
Dr. S. Jaishankar
Imran Khan
Saket Gokhale
CMO Punjab
Al Jazeera English
Shah Mahmood Qureshi
Republic
Republic
Saurabh Gupta(Micky)
United Nations
Amit Shah
गृहमंत्री कार्यालय, HMO India
Nirmala Sitharaman
RSS
Tejasvi Surya
Yogita Bhayana योगिता भयाना
Narendra Modi
PMO India
Des

Unnamed: 0,tweet_url,tweet,evidence_url,claim_reviewed,current_owner_id,indian,user_id,claim_reviewed_markup_language,topic,tweet_language
7,https://twitter.com/GenPanwar/status/124234712...,corona vaccine is ready to be launched in usa,https://www.altnews.in/image-of-covid-19-test-...,roche medical company will launch the vaccine ...,4,1,1,English,Health,English
34,https://twitter.com/BhatkhalkarA/status/124529...,18 nations including usa and uk wants narendra...,https://www.altnews.in/no-us-and-uk-have-not-s...,18 nations including usa and uk want pm modi t...,4,1,1,English,Health,English
37,https://twitter.com/prettypadmaja/status/12457...,18 nations including usa and uk wants pm modi ...,https://www.altnews.in/no-us-and-uk-have-not-s...,18 nations including usa and uk want pm modi t...,4,1,1,English,Health,English
45,https://twitter.com/TrulyMonica/status/1345776...,190 countries have pre booked bharat biotech v...,https://www.altnews.in/india-tvs-rajat-sharma-...,190 countries have pre booked bharat biotech c...,4,1,1,English,Health,English
88,https://twitter.com/ShadRaza1/status/121946187...,russia s metro station if u do 30 situps ur ti...,https://www.altnews.in/fact-check-moscow-metro...,30 squats guarantees a ticket on the moscow metro,4,1,1,English,Health,English


In [None]:
# Write the run-collapsed, mention-replaced variant to disk as CSV.
summ_data_preprocessed_hashtag_mention_run_removed_mention_replaced.to_csv('summ_data_eng_preprocessed_hashtag_mention_run_removed_mention_replaced.csv')

## **Preprocessed with 'Image/Video/Clip Shows' removed from summary**

In [None]:
def remove_shows_summary(text):
  """Strip a leading "<clip|image|video> shows" phrase from a claim summary.

  The match is case-insensitive, so this definition behaves the same as the
  later definition of the same name in this notebook and the result no longer
  depends on which of the two cells ran last.

  Args:
    text: Claim text whose tokens are separated by single spaces.

  Returns:
    The text without its first two tokens when they are one of
    'clip'/'image'/'video' followed by 'shows'; otherwise the text unchanged.
  """
  words = text.split(' ')
  starts_with_media_shows = (
      len(words) > 1
      and words[0].lower() in ('clip', 'image', 'video')
      and words[1].lower() == 'shows'
  )
  return ' '.join(words[2:] if starts_with_media_shows else words)

In [None]:
# Baseline preprocessing, plus removal of a leading "<clip|image|video> shows"
# phrase from the claim summary.
summ_data_preprocessed_shows_removed = summ_data_raw.copy(deep=True)

# Maps every ASCII punctuation character to a single space. Built once here
# instead of once per row.
punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

p.set_options(p.OPT.URL, p.OPT.EMOJI)
summ_data_preprocessed_shows_removed['tweet'] = (
    summ_data_preprocessed_shows_removed['tweet']
    .apply(p.clean)
    .str.replace(r'[#@]', ' ', regex=True)
    .str.translate(punct_to_space)
    .str.lower()
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

summ_data_preprocessed_shows_removed['claim_reviewed'] = (
    summ_data_preprocessed_shows_removed['claim_reviewed']
    .str.translate(punct_to_space)
    .str.lower()
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
    # The lead-in is stripped only after whitespace is normalised. A claim
    # that begins with punctuation (e.g. an opening quote) otherwise starts
    # with an empty token and the prefix check never matches.
    .apply(remove_shows_summary)
)
summ_data_preprocessed_shows_removed.head()

Unnamed: 0,tweet_url,tweet,evidence_url,claim_reviewed,current_owner_id,indian,user_id,claim_reviewed_markup_language,topic,tweet_language
7,https://twitter.com/GenPanwar/status/124234712...,corona vaccine is ready to be launched in usa,https://www.altnews.in/image-of-covid-19-test-...,roche medical company will launch the vaccine ...,4,1,1,English,Health,English
34,https://twitter.com/BhatkhalkarA/status/124529...,18 nations including usa and uk wants narendra...,https://www.altnews.in/no-us-and-uk-have-not-s...,18 nations including usa and uk want pm modi t...,4,1,1,English,Health,English
37,https://twitter.com/prettypadmaja/status/12457...,18 nations including usa and uk wants pm modi ...,https://www.altnews.in/no-us-and-uk-have-not-s...,18 nations including usa and uk want pm modi t...,4,1,1,English,Health,English
45,https://twitter.com/TrulyMonica/status/1345776...,190 countries have pre booked bharat biotech v...,https://www.altnews.in/india-tvs-rajat-sharma-...,190 countries have pre booked bharat biotech c...,4,1,1,English,Health,English
88,https://twitter.com/ShadRaza1/status/121946187...,russia s metro station if u do 30 situps ur ti...,https://www.altnews.in/fact-check-moscow-metro...,30 squats guarantees a ticket on the moscow metro,4,1,1,English,Health,English


In [None]:
# Write the preprocessed, lead-in-removed variant to disk as CSV.
summ_data_preprocessed_shows_removed.to_csv('summ_data_eng_preprocessed_shows_removed.csv')

## **Raw with 'Image/Video/Clip Shows' removed from summary**

In [None]:
def remove_shows_summary(text):
  """Drop a leading "<Clip|Image|Video> shows" phrase, ignoring letter case.

  Args:
    text: Claim text whose tokens are separated by single spaces.

  Returns:
    The text minus its first two tokens when they form the lead-in phrase;
    otherwise the text unchanged.
  """
  tokens = text.split(' ')
  # Fewer than two tokens cannot contain the two-word lead-in.
  if len(tokens) < 2:
    return ' '.join(tokens)

  subject, verb = tokens[0].lower(), tokens[1].lower()
  if verb == 'shows' and subject in ['clip','image','video']:
    tokens = tokens[2:]
  return ' '.join(tokens)

In [None]:
# Raw-text variant: tweets are left untouched and only the leading
# "<Clip|Image|Video> shows" phrase is stripped from the claim summary.
summ_data_raw_shows_removed = summ_data_raw.copy(deep=True)
summ_data_raw_shows_removed['claim_reviewed'] = (
    summ_data_raw_shows_removed['claim_reviewed'].apply(remove_shows_summary)
)
summ_data_raw_shows_removed.head()

Unnamed: 0,tweet_url,tweet,evidence_url,claim_reviewed,current_owner_id,indian,user_id,claim_reviewed_markup_language,topic,tweet_language
7,https://twitter.com/GenPanwar/status/124234712...,Corona Vaccine is ready to be launched in USA.,https://www.altnews.in/image-of-covid-19-test-...,"""Roche Medical company"" will launch the vaccin...",4,1,1,English,Health,English
34,https://twitter.com/BhatkhalkarA/status/124529...,18 nations including USA and UK wants @narendr...,https://www.altnews.in/no-us-and-uk-have-not-s...,18 nations including USA and UK want PM Modi t...,4,1,1,English,Health,English
37,https://twitter.com/prettypadmaja/status/12457...,18 nations including USA and UK wants PM Modi ...,https://www.altnews.in/no-us-and-uk-have-not-s...,18 nations including USA and UK want PM Modi t...,4,1,1,English,Health,English
45,https://twitter.com/TrulyMonica/status/1345776...,190 countries have pre-booked Bharat Biotech v...,https://www.altnews.in/india-tvs-rajat-sharma-...,190 countries have pre-booked Bharat Biotech C...,4,1,1,English,Health,English
88,https://twitter.com/ShadRaza1/status/121946187...,"#Russia's metro station, if u do 30 situps ur ...",https://www.altnews.in/fact-check-moscow-metro...,30 squats guarantees a ticket on the Moscow metro,4,1,1,English,Health,English


In [None]:
# Write the raw, lead-in-removed variant to disk as CSV.
summ_data_raw_shows_removed.to_csv('summ_data_eng_eng_raw_shows_removed.csv')

# **Checking for Duplicates**

In [None]:
# Number of rows that repeat an earlier (tweet, claim_reviewed) pair.
int(summ_data_preprocessed.duplicated(subset=['tweet','claim_reviewed']).sum())

2

In [None]:
# Number of rows that repeat an earlier (tweet, claim_reviewed) pair.
int(summ_data_preprocessed_emoji_replaced.duplicated(subset=['tweet','claim_reviewed']).sum())

1

In [None]:
# Number of rows that repeat an earlier (tweet, claim_reviewed) pair.
int(summ_data_preprocessed_hashtag_mention_removed.duplicated(subset=['tweet','claim_reviewed']).sum())

3

In [None]:
# Number of rows that repeat an earlier (tweet, claim_reviewed) pair.
int(summ_data_preprocessed_hashtag_mention_run_removed.duplicated(subset=['tweet','claim_reviewed']).sum())

2

In [None]:
# Number of rows that repeat an earlier (tweet, claim_reviewed) pair.
int(summ_data_preprocessed_hashtag_removed.duplicated(subset=['tweet','claim_reviewed']).sum())

2

In [None]:
# Number of rows that repeat an earlier (tweet, claim_reviewed) pair.
int(summ_data_preprocessed_mention_removed.duplicated(subset=['tweet','claim_reviewed']).sum())

2

In [None]:
# Number of rows that repeat an earlier (tweet, claim_reviewed) pair.
int(summ_data_preprocessed_hashtag_mention_run_removed_mention_replaced.duplicated(subset=['tweet','claim_reviewed']).sum())

2