In [None]:
!pip install symspellpy
!pip install ftfy
!pip install beautifulsoup4
!pip install unidecode
!pip install contractions
!pip install emoji
!pip install NRCLex

Collecting NRCLex
  Downloading NRCLex-4.0-py3-none-any.whl.metadata (3.2 kB)
INFO: pip is looking at multiple versions of nrclex to determine which version is compatible with other requirements. This could take a while.
  Downloading NRCLex-3.0.0.tar.gz (396 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m396.4/396.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NRCLex
  Building wheel for NRCLex (setup.py) ... [?25l[?25hdone
  Created wheel for NRCLex: filename=NRCLex-3.0.0-py3-none-any.whl size=43309 sha256=ef8e81b03af19170dad870d2c23030ae482eeada8936551c772ac917e146e0b8
  Stored in directory: /root/.cache/pip/wheels/d2/10/44/6abfb1234298806a145fd6bcaec8cbc712e88dd1cd6cb242fa
Successfully built NRCLex
Installing collected packages: NRCLex
Successfully installed NRCLex-3.0.0


In [None]:
# Mount Google Drive at /content/drive; the dataset files read and written
# by the cells below live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
from symspellpy import SymSpell, Verbosity
import urllib.request
import os
import re
import ftfy
from bs4 import BeautifulSoup
import unidecode
import contractions
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import emoji
import pkg_resources
from nrclex import NRCLex
from collections import Counter

# Fetch the NLTK resources used further down: the English stop-word list
# (stopwords.words) and the 'punkt' tokenizer data used by word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Combining the dataset

In [None]:
# Read the three tab-separated splits (train / validate / test).
split_paths = [
    '/content/drive/MyDrive/Colab Notebooks/FYP/fake_news_dataset/fake_news_train.tsv',
    '/content/drive/MyDrive/Colab Notebooks/FYP/fake_news_dataset/fake_news_validate.tsv',
    '/content/drive/MyDrive/Colab Notebooks/FYP/fake_news_dataset/fake_news_test.tsv',
]
train_df, validate_df, test_df = [pd.read_csv(path, sep='\t') for path in split_paths]

# Combine into one DataFrame with a fresh 0..n-1 index.
combined_df = pd.concat([train_df, validate_df, test_df], ignore_index=True)

combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 682661 entries, 0 to 682660
Data columns (total 16 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   author                647820 non-null  object 
 1   clean_title           682661 non-null  object 
 2   created_utc           682661 non-null  float64
 3   domain                479522 non-null  object 
 4   hasImage              682661 non-null  bool   
 5   id                    682661 non-null  object 
 6   image_url             680798 non-null  object 
 7   linked_submission_id  203139 non-null  object 
 8   num_comments          479522 non-null  float64
 9   score                 682661 non-null  int64  
 10  subreddit             682661 non-null  object 
 11  title                 682661 non-null  object 
 12  upvote_ratio          479522 non-null  float64
 13  2_way_label           682661 non-null  int64  
 14  3_way_label           682661 non-null  int64  
 15  

### Data cleaning

In [None]:
# Post-level cleaning, applied in this order:
#   1. drop rows whose 'clean_title' is missing,
#   2. drop rows whose 'clean_title' is empty or whitespace only,
#   3. keep only the first row for each distinct 'clean_title',
#   4. keep only rows with 'num_comments' > 0 (a missing count also fails
#      this comparison, so those rows are dropped as well).
combined_df = (
    combined_df
    .dropna(subset=['clean_title'])
    .loc[lambda frame: frame['clean_title'].str.strip() != '']
    .drop_duplicates(subset='clean_title', keep='first')
    .loc[lambda frame: frame['num_comments'] > 0]
)

combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 353677 entries, 0 to 682660
Data columns (total 16 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   author                337248 non-null  object 
 1   clean_title           353677 non-null  object 
 2   created_utc           353677 non-null  float64
 3   domain                353677 non-null  object 
 4   hasImage              353677 non-null  bool   
 5   id                    353677 non-null  object 
 6   image_url             352226 non-null  object 
 7   linked_submission_id  0 non-null       object 
 8   num_comments          353677 non-null  float64
 9   score                 353677 non-null  int64  
 10  subreddit             353677 non-null  object 
 11  title                 353677 non-null  object 
 12  upvote_ratio          353677 non-null  float64
 13  2_way_label           353677 non-null  int64  
 14  3_way_label           353677 non-null  int64  
 15  6_way

In [None]:
# Load the comments dataset, keeping only the comment text and the id of the
# submission it belongs to; both columns are read as strings.
comments_df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/FYP/fake_news_dataset/all_comments.tsv',
    sep='\t',
    usecols=['body', 'submission_id'],
    dtype={'body': str, 'submission_id': str},
)

comments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10697533 entries, 0 to 10697532
Data columns (total 2 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   body           object
 1   submission_id  object
dtypes: object(2)
memory usage: 163.2+ MB


In [None]:
# Comment-level cleaning, applied in this order:
#   1. drop rows whose 'body' is missing,
#   2. drop rows whose 'body' is empty or whitespace only,
#   3. drop the '[deleted]' / '[removed]' placeholders,
#   4. keep only the first row for each distinct 'body'.
# NOTE(review): step 4 compares 'body' across the whole frame, so an identical
# comment posted under several submissions survives only for the first one -
# confirm this is intended rather than de-duplicating per submission.
comments_df = (
    comments_df
    .dropna(subset=['body'])
    .loc[lambda frame: frame['body'].str.strip() != '']
    .loc[lambda frame: ~frame['body'].isin(['[deleted]', '[removed]'])]
    .drop_duplicates(subset='body', keep='first')
)

comments_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8963539 entries, 0 to 10697532
Data columns (total 2 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   body           object
 1   submission_id  object
dtypes: object(2)
memory usage: 205.2+ MB


### Convert emoji to text

In [None]:
def convert_emoji_to_text(text):
    """Replace each emoji in ``text`` with its ``:name:`` text form."""
    demojized = emoji.demojize(text, delimiters=(":", ":"))
    return demojized

# Run the conversion over every comment body.
comments_df['body'] = comments_df['body'].map(convert_emoji_to_text)

### Fix mojibake text


In [None]:
def fix_mojibake_text(text):
    """Return ``text`` after passing it through ``ftfy.fix_text``."""
    repaired = ftfy.fix_text(text)
    return repaired

# Repair every comment body.
comments_df['body'] = comments_df['body'].map(fix_mojibake_text)

In [None]:
def remove_user_links(text):
    """Delete every ``/user/...`` reference, up to the next whitespace."""
    user_link = re.compile(r'/user/\S+')
    return user_link.sub('', text)

# Strip /user/... references from every comment body.
comments_df['body'] = comments_df['body'].map(remove_user_links)

### Remove newlines & Tabs

In [None]:
def remove_newlines_tabs(text):
    """Flatten line breaks, tabs and backslashes in ``text`` to spaces.

    The substitutions run in the listed order: the two-character sequence
    backslash + 'n' is handled before bare backslashes so it collapses to a
    single space. Finally '. com' is re-joined into '.com'.
    """
    substitutions = (
        ('\\n', ' '),
        ('\n', ' '),
        ('\t', ' '),
        ('\\', ' '),
        ('. com', '.com'),
    )
    formatted_text = text
    for old, new in substitutions:
        formatted_text = formatted_text.replace(old, new)
    return formatted_text

# Flatten newlines, tabs and backslashes in every comment body.
comments_df['body'] = comments_df['body'].map(remove_newlines_tabs)

### Strip HTML Tags

In [None]:
def strip_html_tags(text):
    """Return the text content of ``text`` with HTML markup removed.

    The string is parsed with BeautifulSoup's "html.parser" and the text
    fragments are joined with a single space.
    """
    return BeautifulSoup(text, "html.parser").get_text(separator=" ")

# Strip markup from every comment body.
comments_df['body'] = comments_df['body'].map(strip_html_tags)

  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")


### Remove links

In [None]:
def remove_links(text):
    """Delete every run of non-whitespace that starts with http, www or https."""
    url_like = re.compile(r'http\S+|www\S+|https\S+')
    return url_like.sub('', text)

# Remove URL-like tokens from every comment body.
comments_df['body'] = comments_df['body'].map(remove_links)

### Remove accented characters

In [None]:
def remove_accented_chars(text):
    """Return ``text`` transliterated by ``unidecode.unidecode``."""
    transliterated = unidecode.unidecode(text)
    return transliterated

# Transliterate every comment body.
comments_df['body'] = comments_df['body'].map(remove_accented_chars)

### Expanding Contractions

In [None]:
def expand_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

# Expand contractions in every comment body.
comments_df['body'] = comments_df['body'].map(expand_contractions)

### Remove special characters

In [None]:
def removing_special_characters(text):
    """Replace every run of disallowed characters in ``text`` with one space.

    Kept characters: ASCII letters, digits, and the punctuation
    ``: $ - , % . @ _ # " '``. Everything else (including whitespace runs)
    collapses to a single space.

    Two defects of the previous pattern are fixed here:
      * it was written as two adjacent string literals, so the double quote
        never reached the character class and was stripped;
      * the unescaped ``$-,`` formed a character range, which kept
        ``& ( ) * +`` and stripped the hyphen.
    """
    # Triple-quoted so both quote characters can appear literally; the
    # hyphen is escaped so it is a literal member, not a range operator.
    disallowed_run = r"""[^a-zA-Z0-9:$\-,%.@_#"']+"""
    return re.sub(disallowed_run, ' ', text)

# Replace disallowed characters in every comment body.
comments_df['body'] = comments_df['body'].map(removing_special_characters)

### Remove repeated characters

In [None]:
def remove_repeated_characters(text):
    """Shorten any run of three or more identical characters to exactly two.

    Runs of length two are left as they are.
    """
    repeated_run = re.compile(r'(.)\1+')
    return repeated_run.sub(r'\1\1', text)

# Shorten over-long character runs in every comment body.
comments_df['body'] = comments_df['body'].map(remove_repeated_characters)

### Correcting misspelled words

In [None]:
# Build the SymSpell instance used by the spelling-correction cell below and
# load the frequency dictionary file that is looked up inside the symspellpy
# package.
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt"
)
# term_index / count_index select the dictionary file's term and count columns.
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

True

In [None]:
def correct_spelling_symspell(text):
    """Replace each whitespace-separated token with SymSpell's first suggestion.

    Each token is looked up with ``Verbosity.CLOSEST`` and a maximum edit
    distance of 2, with ``include_unknown=True``. The tokens are re-joined
    with single spaces.
    """
    corrected_tokens = []
    for token in text.split():
        suggestions = sym_spell.lookup(
            token,
            Verbosity.CLOSEST,
            max_edit_distance=2,
            include_unknown=True,
        )
        corrected_tokens.append(suggestions[0].term)
    return " ".join(corrected_tokens)

# Spell-correct every comment body.
comments_df['body'] = comments_df['body'].map(correct_spelling_symspell)

### Remove stopwords

In [None]:
# English stop words as a set for O(1) membership tests.
stoplist = set(stopwords.words('english'))

def removing_stopwords(text):
    """Return ``text`` with English stop words removed.

    The text is tokenized with ``word_tokenize``; tokens whose lower-cased
    form is in ``stoplist`` are dropped and the rest are joined with single
    spaces. A comment consisting only of stop words yields ``''``.

    The text is tokenized directly. It was previously passed through
    ``repr()`` first, which wrapped every comment in quote characters (and
    escaped it), so quote tokens leaked into the output and an "empty"
    result was never actually empty.
    """
    kept_tokens = [word for word in word_tokenize(text) if word.lower() not in stoplist]
    return ' '.join(kept_tokens)

comments_df['body'] = comments_df['body'].apply(removing_stopwords)

In [None]:
# Drop comments whose body is empty or whitespace only after preprocessing.
comments_df = comments_df.loc[comments_df['body'].str.strip().ne('')]

comments_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8963539 entries, 0 to 10697532
Data columns (total 2 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   body           object
 1   submission_id  object
dtypes: object(2)
memory usage: 205.2+ MB


In [None]:
# Keep only the comments whose 'submission_id' matches an 'id' in combined_df.
cleaned_comments_df = comments_df.loc[
    comments_df['submission_id'].isin(combined_df['id'])
]

# Save the cleaned comments as a tab-separated file, without the index column.
cleaned_comments_df.to_csv(
    '/content/drive/MyDrive/Colab Notebooks/FYP/fake_news_dataset/cleaned_comments.tsv',
    sep='\t',
    index=False,
)

cleaned_comments_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5430829 entries, 0 to 10697532
Data columns (total 2 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   body           object
 1   submission_id  object
dtypes: object(2)
memory usage: 124.3+ MB


In [None]:
# Keep only the posts that still have at least one cleaned comment.
combined_df = combined_df.loc[
    combined_df['id'].isin(cleaned_comments_df['submission_id'])
]

combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 328710 entries, 0 to 682660
Data columns (total 16 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   author                313610 non-null  object 
 1   clean_title           328710 non-null  object 
 2   created_utc           328710 non-null  float64
 3   domain                328710 non-null  object 
 4   hasImage              328710 non-null  bool   
 5   id                    328710 non-null  object 
 6   image_url             327567 non-null  object 
 7   linked_submission_id  0 non-null       object 
 8   num_comments          328710 non-null  float64
 9   score                 328710 non-null  int64  
 10  subreddit             328710 non-null  object 
 11  title                 328710 non-null  object 
 12  upvote_ratio          328710 non-null  float64
 13  2_way_label           328710 non-null  int64  
 14  3_way_label           328710 non-null  int64  
 15  6_way

## Emotional Analysis

In [None]:
def analyze_emotions(text):
    """Return the ``affect_frequencies`` that NRCLex computes for ``text``."""
    return NRCLex(text).affect_frequencies

def categorize_emotion_group(emotions):
    """Label an emotion-score mapping as 'novelty', 'expectation' or 'neutral'.

    Args:
        emotions: mapping from emotion name to score; missing names count as 0.

    Returns:
        'novelty' when fear + disgust + surprise is strictly larger than
        anticipation + sadness + joy + trust, 'expectation' when it is
        strictly smaller, and 'neutral' otherwise.
    """
    def _total(names):
        # Sum the scores of the given emotions, defaulting absent ones to 0.
        return sum(emotions.get(name, 0) for name in names)

    novelty_score = _total(['fear', 'disgust', 'surprise'])
    expectation_score = _total(['anticipation', 'sadness', 'joy', 'trust'])

    if novelty_score > expectation_score:
        return 'novelty'
    if expectation_score > novelty_score:
        return 'expectation'
    return 'neutral'

def process_comments(df):
    """Return a copy of ``df`` with per-comment emotion columns added.

    Two columns are appended, in this order:
      * 'emotions'      - the result of ``analyze_emotions`` on each 'body';
      * 'emotion_group' - ``categorize_emotion_group`` applied to 'emotions'.

    The input frame is not modified.
    """
    emotions = df['body'].apply(analyze_emotions)
    emotion_group = emotions.apply(categorize_emotion_group)
    return df.assign(emotions=emotions, emotion_group=emotion_group)

def aggregate_emotion_groups(comment_df, news_df):
    """Attach each post's dominant comment emotion group as a numeric score.

    Args:
        comment_df: one row per comment, with 'submission_id' and
            'emotion_group' columns.
        news_df: one row per post, with an 'id' column matched against
            'submission_id'.

    Returns:
        A copy of ``news_df`` with a float 'emotion_group' column:
        0 for 'expectation', 0.5 for 'neutral', 1 for 'novelty', and NaN for
        posts with no comments in ``comment_df``. ``news_df`` itself is not
        modified.
    """
    emotion_map = {'expectation': 0, 'neutral': 0.5, 'novelty': 1}

    # Most frequent label per submission. Counter.most_common resolves ties
    # in favour of the label encountered first within the group.
    dominant_group = comment_df.groupby('submission_id')['emotion_group'].agg(
        lambda labels: Counter(labels).most_common(1)[0][0]
    )

    news_df = news_df.copy()
    # Plain column assignment replaces the column, so the result is float64.
    # Writing the scores through .loc[:, 'emotion_group'] into an existing
    # label column sets values in place and leaves it as object dtype.
    news_df['emotion_group'] = news_df['id'].map(dominant_group).map(emotion_map)

    return news_df

In [None]:
process_comments = process_comments(cleaned_comments_df)

In [None]:
aggregated_df = aggregate_emotion_groups(process_comments, combined_df)

aggregated_df.info()
aggregated_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 328710 entries, 0 to 682660
Data columns (total 17 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   author                313610 non-null  object 
 1   clean_title           328710 non-null  object 
 2   created_utc           328710 non-null  float64
 3   domain                328710 non-null  object 
 4   hasImage              328710 non-null  bool   
 5   id                    328710 non-null  object 
 6   image_url             327567 non-null  object 
 7   linked_submission_id  0 non-null       object 
 8   num_comments          328710 non-null  float64
 9   score                 328710 non-null  int64  
 10  subreddit             328710 non-null  object 
 11  title                 328710 non-null  object 
 12  upvote_ratio          328710 non-null  float64
 13  2_way_label           328710 non-null  int64  
 14  3_way_label           328710 non-null  int64  
 15  6_way

Unnamed: 0,author,clean_title,created_utc,domain,hasImage,id,image_url,linked_submission_id,num_comments,score,subreddit,title,upvote_ratio,2_way_label,3_way_label,6_way_label,emotion_group
0,Alexithymia,my walgreens offbrand mucinex was engraved wit...,1551641000.0,i.imgur.com,True,awxhir,https://external-preview.redd.it/WylDbZrnbvZdB...,,2.0,12,mildlyinteresting,My Walgreens offbrand Mucinex was engraved wit...,0.84,1,0,0,0.5
1,VIDCAs17,this concerned sink with a tiny hat,1534727000.0,i.redd.it,True,98pbid,https://preview.redd.it/wsfx0gp0f5h11.jpg?widt...,,2.0,119,pareidolia,This concerned sink with a tiny hat,0.99,0,2,2,0.5
3,,puppy taking in the view,1471341000.0,i.imgur.com,True,4xypkv,https://external-preview.redd.it/HLtVNhTR6wtYt...,,26.0,250,photoshopbattles,PsBattle: Puppy taking in the view,0.95,1,0,0,0.5
4,3rikR3ith,i found a face in my sheet music too,1525318000.0,i.redd.it,True,8gnet9,https://preview.redd.it/ri7ut2wn8kv01.jpg?widt...,,2.0,13,pareidolia,I found a face in my sheet music too!,0.84,0,2,2,0.0
5,CrimsonBlue90,bride and groom exchange vows after fatal shoo...,1423681000.0,independent.ie,True,2vkbtj,https://external-preview.redd.it/FQ-J9OIPFRpqi...,,7.0,6,nottheonion,Bride and groom exchange vows after fatal shoo...,0.64,1,0,0,0.5


In [None]:
# One-hot encode the binary label.
# In the rows displayed for this frame, 2_way_label == 1 coincides with
# 3_way_label == 0 and 6_way_label == 0, while 2_way_label == 0 coincides with
# non-zero fine-grained labels. The fine-grained label 0 is the "true" class
# (per the Fakeddit label scheme - confirm against the dataset README), so
# 2_way_label == 1 means real and 2_way_label == 0 means fake.
aggregated_df['real'] = (aggregated_df['2_way_label'] == 1).astype(int)
aggregated_df['fake'] = (aggregated_df['2_way_label'] == 0).astype(int)

# Persist the final post-level dataset as a tab-separated file, without the index.
aggregated_df.to_csv('/content/drive/MyDrive/Colab Notebooks/FYP/fake_news_dataset/final_combined_dataset.tsv', sep='\t', index=False)

aggregated_df.info()
aggregated_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 328710 entries, 0 to 682660
Data columns (total 19 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   author                313610 non-null  object 
 1   clean_title           328710 non-null  object 
 2   created_utc           328710 non-null  float64
 3   domain                328710 non-null  object 
 4   hasImage              328710 non-null  bool   
 5   id                    328710 non-null  object 
 6   image_url             327567 non-null  object 
 7   linked_submission_id  0 non-null       object 
 8   num_comments          328710 non-null  float64
 9   score                 328710 non-null  int64  
 10  subreddit             328710 non-null  object 
 11  title                 328710 non-null  object 
 12  upvote_ratio          328710 non-null  float64
 13  2_way_label           328710 non-null  int64  
 14  3_way_label           328710 non-null  int64  
 15  6_way

Unnamed: 0,author,clean_title,created_utc,domain,hasImage,id,image_url,linked_submission_id,num_comments,score,subreddit,title,upvote_ratio,2_way_label,3_way_label,6_way_label,emotion_group,real,fake
0,Alexithymia,my walgreens offbrand mucinex was engraved wit...,1551641000.0,i.imgur.com,True,awxhir,https://external-preview.redd.it/WylDbZrnbvZdB...,,2.0,12,mildlyinteresting,My Walgreens offbrand Mucinex was engraved wit...,0.84,1,0,0,0.5,0,1
1,VIDCAs17,this concerned sink with a tiny hat,1534727000.0,i.redd.it,True,98pbid,https://preview.redd.it/wsfx0gp0f5h11.jpg?widt...,,2.0,119,pareidolia,This concerned sink with a tiny hat,0.99,0,2,2,0.5,1,0
3,,puppy taking in the view,1471341000.0,i.imgur.com,True,4xypkv,https://external-preview.redd.it/HLtVNhTR6wtYt...,,26.0,250,photoshopbattles,PsBattle: Puppy taking in the view,0.95,1,0,0,0.5,0,1
4,3rikR3ith,i found a face in my sheet music too,1525318000.0,i.redd.it,True,8gnet9,https://preview.redd.it/ri7ut2wn8kv01.jpg?widt...,,2.0,13,pareidolia,I found a face in my sheet music too!,0.84,0,2,2,0.0,1,0
5,CrimsonBlue90,bride and groom exchange vows after fatal shoo...,1423681000.0,independent.ie,True,2vkbtj,https://external-preview.redd.it/FQ-J9OIPFRpqi...,,7.0,6,nottheonion,Bride and groom exchange vows after fatal shoo...,0.64,1,0,0,0.5,0,1
