In [28]:
import pandas as pd
import numpy as np
import re
import nltk
import string
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [3]:
# Read the dataset CSV (path is relative to the working directory) into a DataFrame.
df = pd.read_csv("WELFake_Dataset.csv")

In [5]:
# Discard the "Unnamed: 0" column that came in with the CSV.
df = df.drop(columns=["Unnamed: 0"])

In [7]:
# Count missing values per column.
df.isnull().sum()

title    558
text      39
label      0
dtype: int64

In [14]:
# Drop every row that has a missing value in any column.
# The explicit .copy() makes the result an independent frame, so the later
# `df["text"] = ...` assignments are not flagged with SettingWithCopyWarning
# on pandas versions that mark the dropna() result as a possible copy.
df = df.dropna(how="any").copy()

In [15]:
# Show the current (rows, columns) dimensions of df.
df.shape

(71537, 3)

In [12]:
def preprocess(text):
    """Normalise a raw string before vectorisation.

    The input is lower-cased, every character listed in
    ``string.punctuation`` is deleted, and whitespace-separated tokens of
    length one or less are discarded. The surviving tokens are returned
    joined by single spaces.
    """
    punctuation_table = str.maketrans("", "", string.punctuation)
    cleaned = text.lower().translate(punctuation_table)
    # Tokens produced by split() carry no surrounding whitespace, so their
    # length can be tested directly.
    return " ".join(token for token in cleaned.split() if len(token) > 1)

In [16]:
# Overwrite the "text" column with the output of preprocess() for each entry.
df["text"] = df["text"].apply(preprocess)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["title"] = df["title"].apply(preprocess)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text"] = df["text"].apply(preprocess)


In [18]:
lemmatizer = WordNetLemmatizer()


def lemmatizing(text):
    """Return ``text`` with each whitespace-separated word lemmatized.

    Every word is passed through the module-level ``lemmatizer`` and the
    results are joined with single spaces; text containing no words yields an
    empty string.
    """
    lemmas = [lemmatizer.lemmatize(token) for token in text.split()]
    # The final strip() trims any outer whitespace from the assembled string.
    return " ".join(lemmas).strip()

In [19]:
# Overwrite the "text" column with the output of lemmatizing() for each entry.
df["text"] = df["text"].apply(lemmatizing)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["title"] = df["title"].apply(lemmatizing)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text"] = df["text"].apply(lemmatizing)


In [20]:
stemmer = PorterStemmer()


def stemming(text):
    """Return ``text`` with each whitespace-separated word stemmed.

    Every word is passed through the module-level ``stemmer`` and the stems
    are joined with single spaces; text containing no words yields an empty
    string.
    """
    stems = map(stemmer.stem, text.split())
    # The final strip() trims any outer whitespace from the assembled string.
    return " ".join(stems).strip()

In [21]:
# Overwrite the "text" column with the output of stemming() for each entry.
df["text"] = df["text"].apply(stemming)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["title"] = df["title"].apply(stemming)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text"] = df["text"].apply(stemming)


In [22]:
# Model input is the "text" column; the target is the "label" column.
X, y = df["text"], df["label"]

In [23]:
# Hold out 30% of the samples for testing; the fixed random_state keeps the
# shuffle (and therefore the split) the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4242)

In [24]:
# Fit the TF-IDF vectorizer on the training texts only, then reuse the fitted
# vectorizer to transform the test texts so both share one vocabulary.
tfidf_vect = TfidfVectorizer()
tfidf_train = tfidf_vect.fit_transform(X_train)
tfidf_test = tfidf_vect.transform(X_test)

In [26]:
# Fit the count vectorizer on the training texts only, then reuse the fitted
# vectorizer on the test texts. `.values` passes the underlying array of
# strings rather than the Series itself.
count_vectorizer = CountVectorizer()
cv_train = count_vectorizer.fit_transform(X_train.values)
cv_test = count_vectorizer.transform(X_test.values)

In [30]:
# Multinomial naive Bayes on the TF-IDF features; fit() returns the estimator,
# so construction and training are chained.
multinomial_model = MultinomialNB().fit(tfidf_train, y_train)
y_pred = multinomial_model.predict(tfidf_test)
print("MultinomialNB (TFIDF):", accuracy_score(y_test, y_pred))

MultinomialNB (TFIDF): 0.867812878576088


In [32]:
# Multinomial naive Bayes on the raw count features; fit() returns the
# estimator, so construction and training are chained.
multinomial_model2 = MultinomialNB().fit(cv_train, y_train)
y_pred = multinomial_model2.predict(cv_test)
print("MultinomialNB (CountVectorizer):", accuracy_score(y_test, y_pred))

MultinomialNB (CountVectorizer): 0.8976796197931227


In [33]:
# Random forest on the TF-IDF features.
# random_state pins the forest's internal randomness (bootstrap samples and
# feature sub-sampling) so the reported accuracy is reproducible across runs;
# the value matches the seed already used for the train/test split.
rf_model = RandomForestClassifier(random_state=4242)
rf_model.fit(tfidf_train, y_train)
y_pred = rf_model.predict(tfidf_test)
print("Random Forest (TFIDF):", accuracy_score(y_test, y_pred))

Random Forest (TFIDF): 0.924937098126922


In [34]:
# Random forest on the raw count features.
# random_state pins the forest's internal randomness (bootstrap samples and
# feature sub-sampling) so the reported accuracy - and the model that is
# pickled afterwards - is reproducible across runs; the value matches the seed
# already used for the train/test split.
rf_model2 = RandomForestClassifier(random_state=4242)
rf_model2.fit(cv_train, y_train)
y_pred = rf_model2.predict(cv_test)
print("Random Forest (CountVectorizer):", accuracy_score(y_test, y_pred))

Random Forest (CountVectorizer): 0.9228403690243221


In [None]:
# k-nearest-neighbours on the TF-IDF features; fit() returns the estimator,
# so construction and training are chained.
knn_model = KNeighborsClassifier().fit(tfidf_train, y_train)
y_pred = knn_model.predict(tfidf_test)
print("KNN (TFIDF):", accuracy_score(y_test, y_pred))

In [None]:
# k-nearest-neighbours on the raw count features; fit() returns the estimator,
# so construction and training are chained.
knn_model2 = KNeighborsClassifier().fit(cv_train, y_train)
y_pred = knn_model2.predict(cv_test)
print("KNN (CountVectorizer):", accuracy_score(y_test, y_pred))

In [None]:
# Persist the trained forest and both fitted vectorizers.
# Each file is opened in a `with` block so its handle is flushed and closed as
# soon as the dump finishes, instead of being left open until garbage
# collection.
# NOTE: rf_model2 was fitted on the CountVectorizer features (cv_train), so at
# inference time it must be paired with CountVectorizer.pkl, not TFIDF.pkl.
with open("RF.pkl", "wb") as model_file:
    pickle.dump(rf_model2, model_file)
with open("TFIDF.pkl", "wb") as tfidf_file:
    pickle.dump(tfidf_vect, tfidf_file)
with open("CountVectorizer.pkl", "wb") as count_file:
    pickle.dump(count_vectorizer, count_file)