### Fake News Project

By: Mateo Anusic, Emil Thorlund, Lucas A. Rosing, Victor Bergien

### Task #1

- Structure, process and clean the text.
- Tokenize the text
- Remove stopwords and compute the size of the vocabulary.
- Compute the reduction rate of the vocabulary size after removing stopwords.
- Remove word variations with stemming and compute the size of the vocabulary.
- Compute the reduction rate of the vocabulary size after stemming.

Describe which procedures (and which libraries) you used and why they are appropriate.

In [465]:
### Code ###
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from cleantext import clean 
import cleantext
import csv
import requests
from io import StringIO
from itertools import islice


data_url = 'https://raw.githubusercontent.com/several27/FakeNewsCorpus/master/news_sample.csv'

#nltk.download('punkt')

response = requests.get(data_url)
response.raise_for_status()  #Raise exeption

csv_data = response.content.decode('utf-8')
csv_file = StringIO(csv_data)

reader = csv.DictReader(csv_file)

start_row = 100
end_row = 102

subset_rows = list(islice(reader, start_row, end_row))

#for row_number, row in enumerate(subset_rows, start=start_row):
#    print(f"Row {row_number}:")
#    for column_name, cell_value in row.items():
#        print(f"  {column_name}: {cell_value}")
#    print()  # Print an empty line to separate rows



In [466]:
date_pattern = re.compile(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}.\d{6})|'            # YYYY-MM-DD HH:MM:SS.MMMMMM
                        r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})|'                      # YYYY-MM-DD HH:MM:SS
                        r'(\d{4}-\d{2}-\d{2})|'                                        # YYYY-MM-DD
                        r'(\d{4}\.\d{2}\.\d{2})|'                                      # YYYY.MM.DD 
                        r'(\d{2}\.\d{2}\.\d{4})|'                                      # DD.MM.YYYY
                        r'(\d{4}/\d{2}/\d{2})|'                                        # YYYY/MM/DD
                        r'(\d{2}/\d{2}/\d{4})|'                                        # DD/MM/YYYY
                        r'((january|february|march|april|june|july|august|september|'  # <Month> DD YYYY
                        r'october|november|december) \d{2}, \d{4})', re.IGNORECASE)  
number_pattern = re.compile(r'(\d+(?:,\d{3})*(?:\.\d+)?)')
url_pattern = re.compile(r'https?://\S+|www\.\S+|\S+\.com')

def clean_text_and_tokenize(read):
    read = read.lower()
    read = re.sub(r"\s+", " ", read)
    read = re.sub(date_pattern, '<DATE>', read)
    read = re.sub(number_pattern, "<NUM>", read)
    read = re.sub(r"\S+@\S+", "<EMAIL>", read)
    read = re.sub(url_pattern, "<URL>", read)
    tokens = word_tokenize(read)  # Tokenize the text
    return tokens

def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    return filtered_tokens

def stem_tokens(tokens):
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    
    return stemmed_tokens

In [467]:
for row_number, row in enumerate(subset_rows, start=start_row):
    print(f"Row {row_number}:")
    for column_name, cell_value in row.items():
        tokens = clean_text_and_tokenize(cell_value)  # Clean text and tokenize
        no_stopwords_tokens = remove_stopwords(tokens)
        stemmed_tokens = stem_tokens(no_stopwords_tokens)
        cleaned_cell_value = ' '.join(stemmed_tokens)
        print(f"  {column_name}: {cleaned_cell_value}")
    print()


Row 100:
  : < num >
  id: < num >
  domain: < url >
  type: fake
  url: < url >
  content: greenmedinfo – action item link % reader think stori fact . add two cent . headlin : bitcoin & blockchain search exceed trump ! blockchain stock next ! one link greedmedinfo updat incomplet . letter write campaign locat : make fda advisori , mandatori sourc : < url >
  scraped_at: < date >
  inserted_at: < date >
  updated_at: < date >
  title: greenmedinfo – action item link
  authors: downsiz dc
  keywords: 
  meta_keywords: [ `` ]
  meta_description: 
  tags: 
  summary: 

Row 101:
  : < num >
  id: < num >
  domain: < url >
  type: fake
  url: < url >
  content: < num > annoy twitter auto dm headlin : bitcoin & blockchain search exceed trump ! blockchain stock next ! seen “ cheap supplement < num > ” “ regist busi program ’ receiv endless benefits. ” like thought email spam . ’ real exampl spam , twitter . last week wrote < num > worst social media mistak . one mistak annoy auto direct messa

In [468]:
test_line = "This is an example sentence."
print(clean_text_and_tokenize(test_line))
print(remove_stopwords(clean_text_and_tokenize(test_line)))
print(stem_tokens(remove_stopwords(clean_text_and_tokenize(test_line))))

['this', 'is', 'an', 'example', 'sentence', '.']
['example', 'sentence', '.']
['exampl', 'sentenc', '.']


In [469]:

def count_unique_words(documents):
    word_set = set()
    for document in documents:
        tokens = clean_text_and_tokenize(document)  
        word_set.update(tokens)
    return len(word_set)

# Count unique words before removing stopwords and stemming
unique_words = set(word_tokenize(csv_data.lower()))

# Count unique words after removing stopwords
no_stopwords_unique_words = remove_stopwords(unique_words)
unique_words_after_stopwords = count_unique_words(no_stopwords_unique_words)

#Compute reduction rate in % after removing stopwords
reduction_rate= ((len(unique_words) - unique_words_after_stopwords) / len(unique_words)) * 100

#Count unique words after stemming
after_stemming = stem_tokens(no_stopwords_unique_words)
unique_words_after_stemming = count_unique_words(after_stemming)

#Compute reduction rate in % after stemming the amount of words after removed stopwords
reduction_Rate = ((len(after_stemming) - unique_words_after_stemming) / len(after_stemming)) * 100

print("### Unique words in the data before and after preprocessing ###")
print( ) 
print(f"Unique words before preprocessing:            {len(unique_words)}")
print( ) 
print(f"Unique words after removing stopwords:        {unique_words_after_stopwords}")
print( )
print(f"Reduction rate in % after removing stopwords: {reduction_rate:.2f}%")
print( )
print(f"Unique words after stemming:                  {unique_words_after_stemming}")
print( )
print(f"Reduction rate in % after stemming:           {reduction_Rate:.2f}%")


### Unique words in the data before and after preprocessing ###

Unique words before preprocessing:            19293

Unique words after removing stopwords:        17737

Reduction rate in % after removing stopwords: 8.07%

Unique words after stemming:                  12632

Reduction rate in % after stemming:           34.03%


### Task #2

- Describe how you ended up representing the FakeNewsCorpus dataset (for instance with a Pandas dataframe). Argue for why you chose this design.
- Did you discover any inherent problems with the data while working with it?
- Report key properties of the data set - for instance through statistics or visualization.

The exploration can include (but need not be limited to):

- counting the number of URLs in the content
- counting the number of dates in the content
- counting the number of numeric values in the content
- determining the 100 more frequent words that appear in the content
- plot the frequency of the 10000 most frequent words (any interesting patterns?)
- run the analysis in point 4 and 5 both before and after removing stopwords and applying stemming: do you see any difference?

In [470]:
### Code ###
#Find all the columns
import csv
import urllib.request

url = "https://raw.githubusercontent.com/several27/FakeNewsCorpus/master/news_sample.csv"

#Read CSV file from the url and parse it into a list of dictionaries
with urllib.request.urlopen(url) as response:
    data = [row for row in csv.DictReader(response.read().decode("utf-8").splitlines())]
    
print("Column Names: ", list(data[0].keys()))

Column Names:  ['', 'id', 'domain', 'type', 'url', 'content', 'scraped_at', 'inserted_at', 'updated_at', 'title', 'authors', 'keywords', 'meta_keywords', 'meta_description', 'tags', 'summary']


In [471]:
import csv
import io
import requests
import re

url = "https://raw.githubusercontent.com/several27/FakeNewsCorpus/master/news_sample.csv"
response = requests.get(url)
content = response.content.decode("utf-8")

def count_tokens(rows):
    num_count = 0
    url_count = 0
    date_count = 0
    for row in rows:
        content = row['content']
        num_count += content.count("NUM") #count number of "<NUM>" in column 'content'
        date_count += content.count("DATE") #count number of "<DATE>" in column 'content'
        url_count += content.count("URL") #count number of "<URL>" in column 'content'
    return num_count, date_count, url_count

rows = []
for line in csv.DictReader(io.StringIO(content)):
    line['content'] = clean_text_and_tokenize(line['content'])
    rows.append(line)

num_count, date_count, url_count = count_tokens(rows)

print(f"Number of <NUM> tokens: {num_count}")
print(f"Number of <DATE> tokens: {date_count}")
print(f"Number of <URL> tokens: {url_count}")

Number of <NUM> tokens: 2487
Number of <DATE> tokens: 40
Number of <URL> tokens: 329


### Task #3

Apply your data preprocessing pipeline to a larger proportion of the FakeNewsCorpus https://github.com/several27/FakeNewsCorpus/releases/tag/v1.0

You may find it challenging to run your data processing pipeline on the entire FakeNewsCorpus. At a minimum, you should be able to process 10% of the data using your pipeline,

In [472]:
import csv

def process_text(text):
    if text is None:
        return ''
    tokens = clean_text_and_tokenize(text)
    tokens = remove_stopwords(tokens)
    stemmed_tokens = stem_tokens(tokens)
    return ' '.join(stemmed_tokens)

columns_to_process = {'content', 'type', 'meta_description', 'domain', 'title', 'meta_keyboards'}
with open('news_cleaned_2018_02_13.csv', encoding="utf-8") as f:
    reader = csv.DictReader(f)

    with open('news_cleaned_2018_02_13-results3.csv', 'w', encoding="utf-8") as fOut:
        fieldnames = reader.fieldnames + ["processed_text"]
        writer = csv.DictWriter(fOut, fieldnames=fieldnames)
        writer.writeheader()

        for row in reader:
            processed_row = {column_name: (process_text(cell_value) if column_name in columns_to_process else cell_value) for column_name, cell_value in row.items()}
            writer.writerow(processed_row)


FileNotFoundError: [Errno 2] No such file or directory: 'news_cleaned_2018_02_13.csv'

### Task #4

Split the resulting dataset into a training, validation, and test splits. A common strategy is to uniformly at random split the data 80% / 10% / 10%. You will use the training data to train your baseline and advanced models, the validation data can be used for model selection and hyperparameter tuning, while the test data should only be used in Part 4.

In [None]:
import dask.dataframe as dd
import dask_ml.model_selection
import numpy as np

In [None]:
# Read the CSV file into a Dask DataFrame with specified data types
cleaned_data = dd.read_csv('news_cleaned_2018_02_13-results.csv', encoding="utf-8", dtype={
        'Unnamed: 0': 'object',
        'id': 'object',
        'domain': 'object',
        'type': 'object',
        'url': 'object',
        'content': 'object',
        'scraped_at': 'object',
        'inserted_at': 'object',
        'updated_at': 'object',
        'title': 'object',
        'authors': 'object',
        'keywords': 'float64',
        'meta_keywords': 'object',
        'meta_description': 'object',
        'tags': 'object',
        'summary': 'float64',
        'tokens': 'object',
        'filtered_tokens': 'object',
        'stemmed_tokens': 'object',
    },)

In [None]:
# Define a function to modify the 'type' column values
def modify_type(x):
    if x == 'reliabl':
        return 'reliabl'
    else:
        return 'fake'

In [None]:
# Apply the modify_type function to the 'type' column
cleaned_data['type'] = cleaned_data['type'].map(modify_type, meta=('type', 'object'))

In [None]:
def nan_to_empty(x):
    if isinstance(x, float) and np.isnan(x):
        return ''
    else:
        return x

In [None]:
cleaned_data['content'] = cleaned_data['content'].map(nan_to_empty, meta=('content', 'object'))

In [None]:
# Access the 'type' column
y = cleaned_data['type']
X = cleaned_data['content']

In [None]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = dask_ml.model_selection.train_test_split(X, y, test_size=0.2, random_state=0, shuffle=False)
X_train, X_val, y_train, y_val = dask_ml.model_selection.train_test_split(X_train, y_train, test_size=0.5, random_state=0, shuffle=False)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(X_train)
xv_test = vectorization.transform(X_test) 

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
model = LogisticRegression() 
model.fit(xv_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [None]:
model.score(xv_test,y_test)

0.9947407563320003

In [None]:
pred_model = model.predict(xv_test)

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test,pred_model))

              precision    recall  f1-score   support

        fake       0.99      1.00      1.00    707312
     reliabl       0.90      0.33      0.49      5338

    accuracy                           0.99    712650
   macro avg       0.95      0.67      0.74    712650
weighted avg       0.99      0.99      0.99    712650

