# Fake News Classifier model

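A fake news classifier built with natural language processing: articles are tokenized with NLTK, stripped of English stop words, vectorized with TF-IDF, and classified with logistic regression.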

##### Data Preprocessing

In [None]:
import pandas as pd
import numpy as np
import nltk 
from nltk import word_tokenize
from nltk.corpus import stopwords

In [None]:
# Load the training data; the relative path is resolved against the current
# working directory.
df = pd.read_csv('fake_news_train.csv')
# Preview the first rows of the frame.
df.head()

In [None]:
# Replace every missing value with an empty string so that later text
# processing never receives NaN.
# A scalar fill is applied element-wise, so no axis is needed. Passing axis=1
# makes pandas fill through a double transpose, which on a mixed-dtype frame
# can upcast every column (numeric ones included) to object and is much
# slower on a large frame.
df = df.fillna("")
# Sanity check: every per-column null count should now be 0.
print(df.isnull().sum())

##### Data Tokenization

In [None]:
def tokenize_text(text):
    """Tokenize *text* with NLTK's ``word_tokenize`` and return the result."""
    return word_tokenize(text)

# Run the tokenizer over every entry of the 'text' column and store the
# per-row result in a new 'tokenized_text' column.
df['tokenized_text'] = df['text'].apply(tokenize_text)
# Print the new column as a quick visual check.
print(df['tokenized_text'])


In [None]:
# Build the English stop-word collection once, as a set, so membership tests
# inside the filter are fast.
stop_words = set(stopwords.words('english'))
def remove_stopwords(tokens):
    """Return the tokens whose lowercased form is not in ``stop_words``.

    The comparison is case-insensitive, but the tokens that are kept retain
    their original casing and order.
    """
    return [tok for tok in tokens if tok.lower() not in stop_words]

# Filter stop words out of each row's tokens and keep the result in a
# separate 'tokenized_text_stop' column (the unfiltered tokens are preserved).
df['tokenized_text_stop'] = df['tokenized_text'].apply(remove_stopwords)
# Print the filtered column as a quick visual check.
print(df['tokenized_text_stop'])

##### Data Vectorization

In [None]:
# Features: the stop-word-filtered tokens for each row.
X_df = df['tokenized_text_stop']
# Target: the 'label' column, cast to integers.
y_df = df['label'].astype(int)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The documents are already sequences of tokens, so the tokenizer is an
# identity function and lowercasing is disabled (the built-in preprocessing
# expects raw strings, not token sequences).
# token_pattern=None states explicitly that the default regex is unused;
# without it scikit-learn warns that 'token_pattern' is ignored whenever a
# custom tokenizer is supplied.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=lambda doc: doc,
    lowercase=False,
    token_pattern=None,
)
# Learn the vocabulary and IDF weights, and build the document-term matrix.
tfidf = tfidf_vectorizer.fit_transform(X_df)

#### Model Training and Testing

In [None]:
from sklearn.model_selection import train_test_split

# Split the token sequences first, then fit TF-IDF on the training portion
# only. Fitting the vectorizer on the whole corpus before splitting lets the
# test documents influence the vocabulary and IDF weights (data leakage),
# which makes the reported scores optimistic.
# random_state=0 keeps the split reproducible.
train_tokens, test_tokens, y_train, y_test = train_test_split(
    X_df, y_df, random_state=0
)
# Re-fit the existing vectorizer on the training documents alone.
X_train = tfidf_vectorizer.fit_transform(train_tokens)
# Only transform the held-out documents, using the vocabulary and IDF weights
# learned from the training documents.
X_test = tfidf_vectorizer.transform(test_tokens)

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression classifier with scikit-learn's default hyperparameters.
logreg = LogisticRegression()
# Fit on the training split only; the test split is kept for evaluation.
logreg.fit(X_train, y_train) 

In [None]:
# Predict a label for every row of the held-out test features.
y_pred = logreg.predict(X_test)

#### Model Results

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Score the test-set predictions against the true test labels with each
# metric, in the order accuracy, precision, recall.
accuracy, precision, recall = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
)

In [None]:
# Report each metric on its own line, in the order accuracy, precision, recall.
for message, score in (
    ("The accuracy score is ", accuracy),
    ("The precision score is ", precision),
    ("The recall score is ", recall),
):
    print(message, score)