In [312]:
# Import required libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\willi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [313]:
def preprocess_text(text):

     # TO LOWER CASE
    text = text.lower()

    # REMOVE EMOJIS
    def remove_emoji(string):
        emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
        return emoji_pattern.sub(r'', string)

    # REMOVE URLs
    def remove_URLs(text):
        first = re.sub(r"http\S+", "", text)
        return re.sub(r"\\\/\S+", "", first)

    # REMOVE USERNAME MENTIONS
    def remove_Mentions(text):
        return re.sub(r"@\w+","", text)

    # REMOVE STOP WORDS
    def remove_stopwords(text):
        stop_words = set(stopwords.words('english'))
        words = text.split()
        filtered_words = [word for word in words if word.lower() not in stop_words]
        filtered_text = ' '.join(filtered_words)
        return filtered_text


    # REMOVE SPECIAL CHARACTERS
    def clean_text(text):
        return re.sub(r'[^a-zA-Z0-9\s]', '', text)
    
    def stem_words(text):
        
        porter = PorterStemmer()
        words = word_tokenize(text)
        stemmed_words = [porter.stem(word) for word in words]

        return ' '.join(stemmed_words)
    
    def lemmatize_words(text):
        tokens = word_tokenize(text)
        lemmatizer = WordNetLemmatizer()
        lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]
        return ' '.join(lemmatized_words)
    
    text = remove_emoji(text)
    text = remove_URLs(text)
    text = remove_Mentions(text)
    text = remove_stopwords(text)
    text = clean_text(text)
    text = stem_words(text)
    text = lemmatize_words(text)
    return text


In [314]:
def getMetrics(y_true, y_pred):
    # Calculate confusion matrix
    conf_matrix = confusion_matrix(y_true, y_pred, labels=['fake','real'])
    TP, FP, FN, TN = conf_matrix.ravel()
    print("Confusion Matrix:")
    print(conf_matrix)
    
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    f1 = 2 * ((precision * recall) / (precision + recall))
    
    print("TP: %d FP: %d TN: %d FN: %d" % (TP, FP, TN, FN))
    print("Precision: %0.3f" % precision)
    print("Recall: %0.3f" % recall)
    print("F1 Score: %0.3f" % f1)

In [315]:
#Load training and test data
train_df = pd.read_csv("dataset/mediaeval-2015-trainingset.txt", sep="\t")
test_df = pd.read_csv("dataset/mediaeval-2015-testset.txt", sep="\t")

train_df = pd.DataFrame(data=train_df)
test_df = pd.DataFrame(data=test_df)

# print(train_df.head(10))


In [316]:
# DATA ANALYSIS



In [317]:
#Pre-Processing

# Replace 'humor' label with 'fake' -------------------------------------------------------------------
train_df['label'] = train_df['label'].replace('humor', 'fake')
test_df['label'] = test_df['label'].replace('humor', 'fake')

# # REMOVE DUPLICATES | MAYBE DONT REMOVE DUPLICATES AS IT RESEMBLES REALISM -------------------------------------------------------------------
# train_data = train_data.drop_duplicates(subset=['tweetText'])

train_df['tweetText'] = train_df['tweetText'].apply(preprocess_text)
test_df['tweetText'] = test_df['tweetText'].apply(preprocess_text)

pd.set_option('display.max_colwidth', None)
print(train_df.head(10))


              tweetId  \
0  263046056240115712   
1  262995061304852481   
2  262979898002534400   
3  262996108400271360   
4  263018881839411200   
5  263364439582060545   
6  262927032705490944   
7  263321078884077568   
8  263111677485142017   
9  262977091983785985   

                                                                                         tweetText  \
0  se acuerdan de la pelcula el da despus de maana recuerda lo que est pasando con el huracn sandy   
1          miren sandy en ny tremenda imagen del huracn parece el da de la independencia 2 real rt   
2              buena la foto del huracn sandy recuerda la pelcula da de la independencia id4 sandy   
3                                                                          scary shit hurricane ny   
4                                             fave place world nyc hurricane sandy statueofliberty   
5                                                            42nd time square nyc subway hurricane   
6         

In [318]:
y_train = train_df['label']
y_test = test_df['label']

# Feature extraction using TF-IDF
tfidfVectorizer = TfidfVectorizer()
X_train_tfidf = tfidfVectorizer.fit_transform(train_df['tweetText'])
X_test_tfidf = tfidfVectorizer.transform(test_df['tweetText'])

In [319]:
# # Create a Random Forest classifier
# rfc_tfidf = RandomForestClassifier(n_estimators=100, random_state=42)

# # Train the Random Forest classifier
# rfc_tfidf.fit(X_train_tfidf, y_train)

# # Make predictions on the test set
# rfcPrediction_tfid = rfc_tfidf.predict(X_test_tfidf)

# getMetrics(y_test, rfcPrediction_tfid)

Confusion Matrix:
[[2044  502]
 [ 330  879]]
TP: 2044 FP: 502 TN: 879 FN: 330
Precision: 0.803
Recall: 0.861
F1 Score: 0.831


In [320]:
# # Apply PCA for dimensionality reduction
# n_components = 1000  # Number of components to reduce to (adjust as needed)
# pca = PCA(n_components=n_components)
# X_train_reduced = pca.fit_transform(X_train_count.toarray())  # Fit PCA on training data
# X_test_reduced = pca.transform(X_test_count.toarray())  # Transform test data using the same PCA

# # Display the reduced dimensions
# print("Original shape:", X_train_count.shape)
# print("Reduced shape:", X_train_reduced.shape)

In [321]:
ngramVectorizer = CountVectorizer(ngram_range=(1,2))
X_train_ngram = ngramVectorizer.fit_transform(train_df['tweetText'])
X_test_ngram = ngramVectorizer.transform(test_df['tweetText'])

In [322]:
# Create a Random Forest classifier
rfc_ngram = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the Random Forest classifier
rfc_ngram.fit(X_train_ngram, y_train)
 
# Make predictions on the test set
rfcPrediction_ngram = rfc_ngram.predict(X_test_ngram)

getMetrics(y_test, rfcPrediction_ngram)

Confusion Matrix:
[[2312  234]
 [ 374  835]]
TP: 2312 FP: 234 TN: 835 FN: 374
Precision: 0.908
Recall: 0.861
F1 Score: 0.884


In [323]:
# Create a Naive Bayes classifier
nbc_tfidf = KNeighborsClassifier(n_neighbors=5)

# Train the Naive Bayes classifier
nbc_tfidf.fit(X_train_tfidf, y_train)

# Make predictions on the test set
nbcPrediction_tfidf = nbc_tfidf.predict(X_test_tfidf)

getMetrics(y_test, nbcPrediction_tfidf)

Confusion Matrix:
[[2111  435]
 [ 324  885]]
TP: 2111 FP: 435 TN: 885 FN: 324
Precision: 0.829
Recall: 0.867
F1 Score: 0.848
