In [18]:
import string
import re 
import nltk #NLP 
from nltk.tokenize import word_tokenize #Tokenization
from nltk.corpus import stopwords #Remove common words
import numpy as np #math
import pandas as pd #Read the dataset
from googletrans import Translator #Google translator API

In [19]:
#DATA PREPROCESSING STEP

In [20]:
#Read the csv file (load the WELFake fake-news dataset; change the path to match your machine)
FN_dataset = pd.read_csv('C:/Users/TKZ IT zone/Downloads/archive/WELFake_Dataset.csv')
#Preview the first rows of the freshly loaded frame
FN_dataset.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [21]:
#The 'Unnamed: 0' column is not needed for the analysis, so discard it
FN_dataset = FN_dataset.drop(columns='Unnamed: 0')
FN_dataset.head()

Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,,Did they post their votes for Hillary already?,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [22]:
#Persist the trimmed dataset (row index omitted) as the first edited copy
FN_dataset.to_csv('C:/Users/TKZ IT zone/Downloads/archive/WELFake_Dataset_edit1.csv', index=False)

In [23]:
#Title and text columns are the independent variables; label is the dependent variable.
#Columns are selected by name rather than by position so the split stays
#correct even when this cell is re-run after extra columns (for example
#title_missing / text_missing) have been appended to the frame.
independent = FN_dataset[['title', 'text']].to_numpy()
#Extract only the dependent column
dependent = FN_dataset['label'].to_numpy()

In [24]:
#Display the first row of the independent-variable array under a heading
print("Independent variables (Title and Text columns):", independent[:1], sep="\n")


Independent variables (Title and Text columns):
[['LAW ENFORCEMENT ON HIGH ALERT Following Threats Against Cops And Whites On 9-11By #BlackLivesMatter And #FYF911 Terrorists [VIDEO]'
  'No comment is expected from Barack Obama Members of the #FYF911 or #FukYoFlag and #BlackLivesMatter movements called for the lynching and hanging of white people and cops. They encouraged others on a radio show Tuesday night to  turn the tide  and kill white people and cops to send a message about the killing of black people in America.One of the F***YoFlag organizers is called  Sunshine.  She has a radio blog show hosted from Texas called,  Sunshine s F***ing Opinion Radio Show. A snapshot of her #FYF911 @LOLatWhiteFear Twitter page at 9:53 p.m. shows that she was urging supporters to  Call now!! #fyf911 tonight we continue to dismantle the illusion of white Below is a SNAPSHOT Twitter Radio Call Invite   #FYF911The radio show aired at 10:00 p.m. eastern standard time.During the show, callers clearly c

In [25]:
#Display the first five labels of the dependent-variable array under a heading
print("\nDependent variable (Label):", dependent[:5], sep="\n")


Dependent variable (Label):
[1 1 1 0 1]


In [29]:
#Cleaning data
#1. Flag the rows whose title/text is null in two new boolean columns, then
#   replace the nulls with the placeholder "missing". The flag columns record
#   which rows held no real text, so the word "missing" need not bias the model.
#2. Convert both the title and the text column to lowercase for consistency.
for column in ('title', 'text'):
    flag_column = f'{column}_missing'
    #Compute the flag only once: after the nulls have been filled, re-running
    #this cell would otherwise overwrite the flags with all-False values.
    if flag_column not in FN_dataset.columns:
        FN_dataset[flag_column] = FN_dataset[column].isna()
    #Assign the result back instead of calling fillna(inplace=True) on a column
    #selection; pandas deprecates that chained form and it stops updating the
    #parent frame under copy-on-write.
    FN_dataset[column] = FN_dataset[column].fillna('missing').str.lower()

#3.Removing punctuation and special characters.
def remove_punctuation(text):
    """Return ``text`` with punctuation and special characters stripped.

    Every character that is neither a word character (``\\w``) nor whitespace
    (``\\s``) is deleted, and the underscore -- which ``\\w`` would otherwise
    keep -- is deleted as well.

    A single explicit pattern is used instead of interpolating
    ``string.punctuation`` unescaped into a character class, which only
    worked by accident (``,-.`` was parsed as a range and ``\\]`` as an
    escaped bracket).

    Parameters:
        text: the string to clean.

    Returns:
        The cleaned string; whitespace is preserved unchanged.
    """
    return re.sub(r"[^\w\s]|_", "", text)
#Strip punctuation from every entry of the title and text columns
FN_dataset['title'] = FN_dataset['title'].map(remove_punctuation)
FN_dataset['text'] = FN_dataset['text'].map(remove_punctuation)

#Write the cleaned frame to disk as the second edited copy
FN_dataset.to_csv('C:/Users/TKZ IT zone/Downloads/archive/WELFake_Dataset_edit2.csv', index=False)






In [37]:
#4.Tokenization: split the text into word tokens
#Fetch the NLTK 'punkt' tokenizer models
nltk.download('punkt')
#Reload the cleaned dataset written by the previous step
FN_dataset = pd.read_csv('C:/Users/TKZ IT zone/Downloads/archive/WELFake_Dataset_edit2.csv')

#Turn any NaN in the two text columns into an empty string and coerce every entry to str
FN_dataset[['title', 'text']] = FN_dataset[['title', 'text']].fillna('').astype(str)

def tokenize_text(text):
    """Tokenize ``text`` with ``word_tokenize`` and return only the alphanumeric tokens as a list."""
    kept_tokens = []
    for token in word_tokenize(text):
        #Discard any token that is not purely alphanumeric
        if token.isalnum():
            kept_tokens.append(token)
    return kept_tokens

#Tokenize the 'title' and 'text' columns into the new *_tokens columns
FN_dataset['title_tokens'] = FN_dataset['title'].map(tokenize_text)
FN_dataset['text_tokens'] = FN_dataset['text'].map(tokenize_text)

#Write the tokenized frame to disk as the third edited copy
#NOTE(review): CSV has no list type, so the token lists are presumably stored
#as their string form -- confirm before relying on them after a reload.
FN_dataset.to_csv('C:/Users/TKZ IT zone/Downloads/archive/WELFake_Dataset_edit3.csv', index=False)

[nltk_data] Downloading package punkt to C:\Users\TKZ IT
[nltk_data]     zone\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [40]:
#Reload the tokenized dataset from disk to inspect the outcome of the cleaning steps
FN_dataset = pd.read_csv('C:/Users/TKZ IT zone/Downloads/archive/WELFake_Dataset_edit3.csv')
#List the column names, then show the first rows
print(FN_dataset.columns)
print(FN_dataset.head())

Index(['title', 'text', 'label', 'title_missing', 'text_missing',
       'title_tokens', 'text_tokens'],
      dtype='object')
                                               title  \
0  law enforcement on high alert following threat...   
1                                            missing   
2  unbelievable obamas attorney general says most...   
3  bobby jindal raised hindu uses story of christ...   
4  satan 2 russia unvelis an image of its terrify...   

                                                text  label  title_missing  \
0  no comment is expected from barack obama membe...      1          False   
1      did they post their votes for hillary already      1          False   
2   now most of the demonstrators gathered last n...      1          False   
3  a dozen politically active pastors came here f...      0          False   
4  the rs28 sarmat missile dubbed satan 2 will re...      1          False   

   text_missing                                       title_tokens 