In [5]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd

# Locations of the two input CSV files on the mounted Drive. The paths are kept
# in their own names so that `kaggle_dataset` / `huggingFace_dataset` only ever
# refer to DataFrames (previously each name held a path string first and a
# DataFrame afterwards).
DATA_DIR = '/content/drive/MyDrive/deeplearningProgression'
KAGGLE_CSV_PATH = f'{DATA_DIR}/kaggle_dataset.csv'
HUGGINGFACE_CSV_PATH = f'{DATA_DIR}/huggingface_dataset_fixed.csv'

# Load both datasets into DataFrames.
kaggle_dataset = pd.read_csv(KAGGLE_CSV_PATH)
huggingFace_dataset = pd.read_csv(HUGGINGFACE_CSV_PATH)


# Preview the first rows of each frame as a quick sanity check of the load.
print(kaggle_dataset.head())
print(huggingFace_dataset.head())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
                                               title  \
0  law enforcement high alert following threat co...   
1                                            missing   
2  unbelievable obamas attorney general say charl...   
3  bobby jindal raised hindu us story christian c...   
4  satan 2 russia unvelis image terrifying new su...   

                                                text  label  title_missing  \
0  comment expected barack obama member fyf911 fu...      1          False   
1                          post vote hillary already      1           True   
2  demonstrator gathered last night exercising co...      1          False   
3  dozen politically active pastor came private d...      0          False   
4  rs28 sarmat missile dubbed satan 2 replace ss1...      1          False   

   text_missing                                       title_token

In [6]:
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd

# Download the NLTK resources used below: the Punkt tokenizer models, the
# English stop-word list, and WordNet (for the lemmatizer).
nltk.download('punkt')
# NLTK 3.8.2 and later load the tokenizer from 'punkt_tab' rather than 'punkt';
# without it word_tokenize raises LookupError on a fresh environment.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# 1. Remove any unnamed columns (if present).
#    .copy() makes the filtered result an independent frame, so the column
#    assignments below do not write into a slice of the original frame
#    (which can trigger pandas' SettingWithCopyWarning).
huggingFace_dataset = huggingFace_dataset.loc[:, ~huggingFace_dataset.columns.str.contains('^Unnamed')].copy()

# 2. Record which rows had a missing/empty value, then replace missing news
#    text with the placeholder "missing".
#    NOTE: the flags are computed from the current contents of the columns, so
#    re-running this cell after 'news' has already been filled yields all-False
#    flags; re-run the loading cell first.
huggingFace_dataset['news_missing'] = huggingFace_dataset['news'].isna() | huggingFace_dataset['news'].eq('')
huggingFace_dataset['label_missing'] = huggingFace_dataset['label'].isna() | huggingFace_dataset['label'].eq('')

# Empty strings and NaN both become the same placeholder.
huggingFace_dataset['news'] = huggingFace_dataset['news'].replace('', 'missing').fillna('missing')

# 3. Convert all text to lowercase
huggingFace_dataset['news'] = huggingFace_dataset['news'].str.lower()

# 4. Remove punctuation
def remove_punctuation(text):
    """Return ``text`` with all punctuation characters removed.

    Two passes are applied:

    1. every character listed in ``string.punctuation`` is dropped (this is
       what removes the underscore, which the second pass would keep because
       it counts as a word character);
    2. any remaining character that is neither a word character nor
       whitespace is dropped (e.g. typographic quotes and dashes).

    Parameters
    ----------
    text : str
        The text to clean. Non-string values (such as NaN or None) are
        returned unchanged, matching ``remove_stopwords`` and
        ``lemmatize_text``.

    Returns
    -------
    str
        The cleaned text; whitespace is left exactly as it was.
    """
    if not isinstance(text, str):
        return text
    # re.escape is required here: string.punctuation contains ']', '\\', '^'
    # and '-', all of which are special inside a character class. The
    # unescaped form only worked because of the order of those characters.
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    text = re.sub(r"[^\w\s]", "", text)
    return text

# Strip punctuation from every news text, element by element.
huggingFace_dataset['news'] = huggingFace_dataset['news'].map(remove_punctuation)

# 5. Remove stopwords
# Build the lookup set of English stop words once; it is shared by
# remove_stopwords and lemmatize_text.
# NOTE(review): punctuation has already been stripped at this point, so
# apostrophe forms in the NLTK list (e.g. "don't") can no longer match the
# text ("dont") - confirm whether that is intended.
stop_words = {word for word in stopwords.words('english')}

def remove_stopwords(text):
    """Drop stop words from ``text``.

    The text is split with ``word_tokenize``; every token whose lowercase form
    is in ``stop_words`` is discarded and the remaining tokens are joined back
    together with single spaces. Values that are not strings are returned
    unchanged.
    """
    if not isinstance(text, str):
        return text
    kept_tokens = (
        token for token in word_tokenize(text)
        if token.lower() not in stop_words
    )
    return ' '.join(kept_tokens)

# Filter stop words out of every news text, element by element.
huggingFace_dataset['news'] = huggingFace_dataset['news'].map(remove_stopwords)

# 6. Tokenization
def tokenize_text(text):
    """Split ``text`` with ``word_tokenize`` and return the list of tokens
    that consist solely of alphanumeric characters."""
    return list(filter(str.isalnum, word_tokenize(text)))

# Store the token list of each news text in its own column.
# NOTE(review): this runs before step 7 rewrites 'news', so 'news_tokens'
# holds the tokens of the un-lemmatized text - confirm that is intended.
huggingFace_dataset['news_tokens'] = huggingFace_dataset['news'].map(tokenize_text)

# 7. Lemmatization
# A single lemmatizer instance is created here and reused by lemmatize_text.
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize every word of ``text``.

    The text is split with ``word_tokenize``; each token is lowercased, tokens
    found in ``stop_words`` are skipped, and the rest are passed to
    ``lemmatizer.lemmatize`` (no part-of-speech argument is given, so the
    lemmatizer's default applies). The lemmas are joined with single spaces.
    Values that are not strings are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    lemmas = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if lowered in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(lowered))
    return ' '.join(lemmas)

# Replace each news text with its lemmatized form.
huggingFace_dataset['news'] = huggingFace_dataset['news'].map(lemmatize_text)

# Write the processed frame (without the index) to a CSV file. The path is
# relative, so the file is created in the current working directory.
huggingFace_dataset.to_csv(path_or_buf='huggingface_dataset_processed.csv', index=False)

# Print the first five rows to eyeball the result of the cleaning steps.
print(huggingFace_dataset.head(5))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                                news  label  news_missing  \
0  pennsylvania court order count ballot election...      0         False   
1  biden democrat dismantled border security neva...      0         False   
2  katie hobbs voted double gas tax runup midterm...      1         False   
3  reuters reported nancy pelosi bought 10 millio...      0         False   
4  true united state built stolen land florida go...      0         False   

   label_missing                                        news_tokens  
0          False  [pennsylvania, court, order, count, ballots, e...  
1          False  [biden, democrats, dismantled, border, securit...  
2          False  [katie, hobbs, voted, double, gas, tax, runup,...  
3          False  [reuters, reported, nancy, pelosi, bought, 10,...  
4          False  [true, united, states, built, stolen, land, fl...  


In [7]:
# Write the current state of the frame (without the index) to a second CSV
# file in the current working directory.
huggingFace_dataset.to_csv(path_or_buf='huggingface_dataset.csv', index=False)