# In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
# This line imports the TfidfVectorizer class from the sklearn.feature_extraction.text module. TfidfVectorizer is used to convert a collection of raw documents into a matrix of TF-IDF features.

# This line imports the LogisticRegression class from the sklearn.linear_model module
from sklearn.linear_model import LogisticRegression

# This line imports the pandas library and gives it the alias pd. pandas is a powerful data manipulation library in Python, commonly used for working with structured data.
import pandas as pd

# This line imports the pickle module, which is used for serializing and deserializing Python objects. It allows us to save Python objects (such as models) to files and load them back into memory later.
import pickle

# Load the dataset.
# The steps below require 'title' and 'label' columns in 'True.csv'.
# NOTE(review): presumably news headlines with REAL/FAKE-style labels --
# confirm against the actual file contents.
df = pd.read_csv('True.csv')

# Drop rows with a missing title or label. pandas reads empty CSV cells as
# NaN, and TfidfVectorizer rejects NaN documents with a ValueError, so a
# single blank headline would otherwise abort the whole run. Rows are dropped
# from df itself so that X and y below stay row-aligned with it.
df = df.dropna(subset=['title', 'label'])

# Preprocess the data.
# TF-IDF features over the headlines, with English stop words removed.
vectorizer = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and IDF weights from the titles and encode them in one
# pass: X has one row per title and one column per vocabulary term.
X = vectorizer.fit_transform(df['title'])

# Target labels for the classifier, taken from the same rows as X.
y = df['label']

# Train the model.
# Build a logistic regression classifier with its default settings and fit it
# on the TF-IDF matrix X against the labels y. fit() returns the estimator
# itself, so `model` is bound to the fitted classifier.
model = LogisticRegression().fit(X, y)

# Save the fitted model and the fitted vectorizer, each to its own file.
# Both objects are serialized with pickle into a file opened in write-binary
# mode; the vectorizer is stored alongside the model so that new text can be
# transformed with the same vocabulary the model was trained on.
for artifact, filename in ((model, 'model.pkl'), (vectorizer, 'vectorizer.pkl')):
    with open(filename, 'wb') as f:
        pickle.dump(artifact, f)
