In [1]:
import pandas as pd
import spacy
import pickle
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the news dataset from the CSV file (the path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer='../data/news.csv')

In [3]:
# Discard the 'date' column; every other column is kept unchanged
df = df.drop(columns=['date'])

In [4]:
# Collect every row whose 'text' value occurs more than once.
# keep=False flags all members of a duplicate group, not only the repeats.
duplicated = df.loc[df.duplicated(subset='text', keep=False)]

In [5]:
# De-duplicate on the 'text' column, retaining the first row of each duplicate group
df = df.drop_duplicates(subset='text', keep='first')

In [6]:
# Drop each row that has a missing value in at least one column
df = df.dropna(axis='index', how='any')

In [7]:
# Instantiate spaCy's 'en_core_web_sm' pipeline; process_text() below looks it up as the global `nlp`
nlp = spacy.load(name='en_core_web_sm')

In [8]:
def process_text(text):
    """Return the lemmas of all non-punctuation tokens in ``text``.

    The text is passed to the module-level spaCy pipeline ``nlp``. Each
    resulting token whose ``is_punct`` flag is false contributes its
    ``lemma_`` to the output, in document order.

    Parameters
    ----------
    text
        The raw text handed to ``nlp`` unchanged.

    Returns
    -------
    list
        One lemma per retained token; empty when no token is retained.
    """
    lemmas = []
    for token in nlp(text):
        # Punctuation tokens are skipped; everything else is kept as its lemma
        if token.is_punct:
            continue
        lemmas.append(token.lemma_)
    return lemmas

In [9]:
# Run process_text over each entry of the 'text' column, in row order,
# producing one list of lemmas per row
processed_corpus = list(map(process_text, df['text']))

In [10]:
# Encode the labels using LabelEncoder: fit_transform learns the distinct values of
# df['label'] and returns one integer code per row, in the DataFrame's row order.
# NOTE(review): only labels_encoded is pickled in the cells visible here; the fitted
# label_encoder (and its classes_ mapping back to the original labels) is not saved —
# confirm whether downstream code needs it.
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(df['label'])

In [11]:
# Serialize processed_corpus with pickle and write it to ../data/processed_corpus.pkl
# (binary mode; an existing file at that path is overwritten)
with open('../data/processed_corpus.pkl', mode='wb') as f:
    pickle.dump(obj=processed_corpus, file=f)

In [12]:
# Serialize labels_encoded with pickle and write it to ../data/labels.pkl
# (binary mode; an existing file at that path is overwritten)
with open('../data/labels.pkl', mode='wb') as f:
    pickle.dump(obj=labels_encoded, file=f)