In [None]:
import pandas as pd
import nltk
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Download the NLTK resources this notebook relies on:
#   punkt / punkt_tab -> tokenizer models used by word_tokenize
#   stopwords         -> English stopword list
#   wordnet           -> data for WordNetLemmatizer
# Newer NLTK releases load `punkt_tab` instead of the pickled `punkt` models,
# so both are requested; `punkt` keeps older installations working.
# NOTE(review): on an NLTK version that does not know `punkt_tab`, the
# downloader is expected to report the failure and continue — confirm.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Everything is read from / written to the directory the kernel was started in.
script_dir = os.getcwd()

# Absolute paths of the two input CSV files.
file_path1, file_path2 = (
    os.path.join(script_dir, csv_name) for csv_name in ('True.csv', 'Fake.csv')
)

# Read both files with every column forced to str.
data1, data2 = (
    pd.read_csv(csv_path, dtype=str) for csv_path in (file_path1, file_path2)
)

# Keep DataFrame previews compact: at most 100 characters per cell, 2 rows.
for option_name, option_value in (('display.max_colwidth', 100),
                                  ('display.max_rows', 2)):
    pd.set_option(option_name, option_value)

# Stack the two frames into one with a fresh 0..n-1 index and persist it.
data = pd.concat((data1, data2), ignore_index=True)
output_file_path = os.path.join(script_dir, 'News_Dataset.csv')
data.to_csv(output_file_path, index=False)

# NOTE: rows with missing values are not dropped here. A previously sketched
# filter was:
#   data.dropna(subset=['title', 'text', 'subject', 'date', 'type'], how='any')
# NOTE(review): later cells use a 'type' column as the label; it is presumably
# already present in True.csv / Fake.csv — confirm.

In [None]:
# Module-level WordNet lemmatizer instance; preprocess_text() below calls
# lemmatizer.lemmatize() on each token, so it is created once here rather
# than per call.
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """
    Cleans and preprocesses the input text.

    The text is tokenized, lower-cased, stripped of English stopwords and of
    punctuation tokens, lemmatized, and finally reduced to purely alphabetic
    tokens.

    Parameters:
    text (str): The text to preprocess. Non-string values (for example the
        float NaN pandas uses for a missing cell, or None) are treated as
        empty text.

    Returns:
    str: The remaining tokens joined by single spaces; '' when nothing
        is left or the input was not a string.
    """
    # Missing cells arrive as NaN/None; the tokenizer cannot handle them, so
    # short-circuit to an empty result instead of raising.
    if not isinstance(text, str):
        return ''

    # Build the stopword set once and cache it on the function object so it is
    # not rebuilt for every row when used with Series.apply().
    stop_words = getattr(preprocess_text, '_stop_words', None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        preprocess_text._stop_words = stop_words

    # Tokenize the text into lower-cased words
    tokens = [token.lower() for token in word_tokenize(text)]

    # Remove stopwords (common words that do not carry much meaning)
    tokens = [token for token in tokens if token not in stop_words]

    # Remove punctuation and special characters
    tokens = [token for token in tokens if token not in string.punctuation]

    # Lemmatize tokens to their base form
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove tokens that are not alphabetic (numbers, mixed tokens, ...)
    tokens = [token for token in tokens if token.isalpha()]

    # Join tokens back into a single string
    return ' '.join(tokens)

# Clean every article body. Missing texts are read as NaN, so they are
# replaced with '' first to make sure preprocess_text always gets a string.
data['Tokenized Text'] = data['text'].fillna('').apply(preprocess_text)
print(data['Tokenized Text'].head(1))

# Define the output file path for the tokenized data. This must be a string;
# the bare expression `tokenized_data.csv` is a lookup of an undefined name.
tokenized_output_file_path = 'tokenized_data.csv'

# Save the tokenized data to a CSV file
data.to_csv(tokenized_output_file_path, index=False)


In [None]:
from sklearn.model_selection import train_test_split

# Split the data into training (70%) and test (30%) sets; the fixed
# random_state makes the partition reproducible.
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

# Define the output file paths for the split datasets. These must be string
# literals: `train_data.csv` without quotes is an attribute lookup on the
# DataFrame and raises AttributeError.
train_output_file_path = 'train_data.csv'
test_output_file_path = 'test_data.csv'

# Save the split datasets to CSV files
train_data.to_csv(train_output_file_path, index=False)
test_data.to_csv(test_output_file_path, index=False)

In [None]:
from sklearn.model_selection import train_test_split

# NOTE: this cell repeats the split performed in the preceding cell. Because
# random_state is fixed, running it again produces the same partition and
# rewrites the same two files.
# Split the data into training (70%) and test (30%) sets
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

# Define the output file paths for the split datasets. These must be string
# literals: `train_data.csv` without quotes is an attribute lookup on the
# DataFrame and raises AttributeError.
train_output_file_path = 'train_data.csv'
test_output_file_path = 'test_data.csv'

# Save the split datasets to CSV files
train_data.to_csv(train_output_file_path, index=False)
test_data.to_csv(test_output_file_path, index=False)

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, f1_score, log_loss
import matplotlib.pyplot as plt

# Load the data
train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')

# Replace missing values with empty strings. The result is assigned back to
# the column: fillna(..., inplace=True) on a column selection is chained
# assignment, which pandas warns about and which does not update the frame
# when Copy-on-Write is active.
train_data['Tokenized Text'] = train_data['Tokenized Text'].fillna('')
test_data['Tokenized Text'] = test_data['Tokenized Text'].fillna('')

# Separate features (X) and target (y)
X_train, y_train = train_data['Tokenized Text'], train_data['type']
X_test, y_test = test_data['Tokenized Text'], test_data['type']

# Define models: each is a TF-IDF vectorizer followed by a classifier.
# The tree ensembles are seeded (42, as used for the train/test split) so the
# reported numbers are reproducible across runs.
models = [
    ('Naive Bayes', make_pipeline(TfidfVectorizer(), MultinomialNB())),
    ('Logistic Regression', make_pipeline(TfidfVectorizer(), LogisticRegression())),
    ('Random Forest', make_pipeline(TfidfVectorizer(), RandomForestClassifier(random_state=42))),
    ('Gradient Boosting', make_pipeline(TfidfVectorizer(), GradientBoostingClassifier(random_state=42))),
    #('Support Vector Machine', make_pipeline(TfidfVectorizer(), SVC(probability=True)))
]

# Train models and evaluate; one list entry per model in each column
results = {'Model': [], 'Test Accuracy': [], 'Test Precision': [], 'Test F1': [], 'Test Loss': []}

for name, model in models:
    print("Training", name, "model...")
    model.fit(X_train, y_train)

    # Test set evaluation
    y_test_pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred, average='weighted')
    test_f1 = f1_score(y_test, y_test_pred, average='weighted')
    # Pass the classifier's own class order so the probability columns are
    # matched to labels explicitly, even if the test set lacks a class.
    test_loss = log_loss(y_test, model.predict_proba(X_test), labels=model.classes_)

    # Save results
    results['Model'].append(name)
    results['Test Accuracy'].append(test_accuracy)
    results['Test Precision'].append(test_precision)
    results['Test F1'].append(test_f1)
    results['Test Loss'].append(test_loss)
    print(name, "model trained and evaluated successfully.")

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Plot test accuracy (left) and test log loss (right) per model.
# Precision and F1 are kept in results_df but are not plotted here.
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.bar(results_df['Model'], results_df['Test Accuracy'], color='skyblue')
plt.title('Test Accuracy Comparison')
plt.ylabel('Accuracy')
plt.xlabel('Model')
plt.xticks(rotation=45)

plt.subplot(1, 2, 2)
plt.bar(results_df['Model'], results_df['Test Loss'], color='salmon')
plt.title('Test Log Loss Comparison')
plt.ylabel('Log Loss')
plt.xlabel('Model')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the data
train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')

# Handle missing values by replacing NaN with an empty string. The result is
# assigned back to the column: fillna(..., inplace=True) on a column selection
# is chained assignment and does not update the frame under Copy-on-Write.
train_data['Tokenized Text'] = train_data['Tokenized Text'].fillna('')
test_data['Tokenized Text'] = test_data['Tokenized Text'].fillna('')

# Separate features (X) and target (y)
X_train, y_train = train_data['Tokenized Text'], train_data['type']
X_test, y_test = test_data['Tokenized Text'], test_data['type']

# Define the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary/IDF weights on the training text only, then apply the
# same transformation to the test text.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Define and train the MLP Classifier (one hidden layer of 100 units).
# random_state fixes weight initialisation and shuffling so the reported
# accuracy is reproducible; 42 matches the seed used for the data split.
mlp_classifier = MLPClassifier(hidden_layer_sizes=(100,), random_state=42)
mlp_classifier.fit(X_train_tfidf, y_train)

# Evaluate the MLP Classifier
y_test_pred = mlp_classifier.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Testing Accuracy (Neural Network):", test_accuracy)

# Visualize the results
results_df = pd.DataFrame({
    'Model': ['Neural Network'],
    'Testing Accuracy': [test_accuracy]
})

plt.figure(figsize=(6, 4))
sns.barplot(x='Model', y='Testing Accuracy', data=results_df, color='orange')
plt.title('Model Accuracy Comparison')
plt.ylabel('Accuracy')
plt.xlabel('Model')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt

# Load the data
train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')

# Replace missing values with empty strings. The result is assigned back to
# the column: fillna(..., inplace=True) on a column selection is chained
# assignment and does not update the frame under Copy-on-Write.
train_data['Tokenized Text'] = train_data['Tokenized Text'].fillna('')
test_data['Tokenized Text'] = test_data['Tokenized Text'].fillna('')

# Separate features (X) and target (y)
X_train, y_train = train_data['Tokenized Text'], train_data['type']
X_test, y_test = test_data['Tokenized Text'], test_data['type']

# Define models: each is a TF-IDF vectorizer followed by a classifier.
# The tree ensembles are seeded (42, as used for the train/test split) so the
# reported numbers are reproducible across runs.
models = [
    ('Naive Bayes', make_pipeline(TfidfVectorizer(), MultinomialNB())),
    ('Logistic Regression', make_pipeline(TfidfVectorizer(), LogisticRegression())),
    ('Random Forest', make_pipeline(TfidfVectorizer(), RandomForestClassifier(random_state=42))),
    ('Gradient Boosting', make_pipeline(TfidfVectorizer(), GradientBoostingClassifier(random_state=42)))
]

# Train models and evaluate; one list entry per model in each column
results = {'Model': [], 'Test Accuracy': [], 'Test Precision': [], 'Test Recall': [], 'Test F1': []}

# Confusion matrix of every model, keyed by model name. Previously the matrix
# was computed and then overwritten on the next iteration without being used.
confusion_matrices = {}

for name, model in models:
    print("Training", name, "model...")
    model.fit(X_train, y_train)

    # Test set evaluation (class-frequency weighted averages)
    y_test_pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred, average='weighted')
    test_recall = recall_score(y_test, y_test_pred, average='weighted')
    test_f1 = f1_score(y_test, y_test_pred, average='weighted')

    # Confusion Matrix: rows are true labels, columns are predicted labels,
    # both ordered as in model.classes_.
    confusion_mat = confusion_matrix(y_test, y_test_pred, labels=model.classes_)
    confusion_matrices[name] = confusion_mat
    print("Confusion Matrix:")
    print(confusion_mat)

    # Save results
    results['Model'].append(name)
    results['Test Accuracy'].append(test_accuracy)
    results['Test Precision'].append(test_precision)
    results['Test Recall'].append(test_recall)
    results['Test F1'].append(test_f1)
    print(name, "model trained and evaluated successfully.")

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Print results
print(results_df)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

# Load the datasets. The literal file names are used, as in the other
# modelling cells, so this cell does not depend on path variables defined
# by the splitting cell.
train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')

# Encode the target column as integers (fit on train, reuse on test)
label_encoder = LabelEncoder()
train_data['type'] = label_encoder.fit_transform(train_data['type'])
test_data['type'] = label_encoder.transform(test_data['type'])

# Fill missing values in the text data. The result is assigned back to the
# column: fillna(..., inplace=True) on a column selection is chained
# assignment and does not update the frame under Copy-on-Write.
train_data['Tokenized Text'] = train_data['Tokenized Text'].fillna('')
test_data['Tokenized Text'] = test_data['Tokenized Text'].fillna('')

# Define features (X) and target (y)
X_train = train_data['Tokenized Text']
y_train = train_data['type']
X_test = test_data['Tokenized Text']
y_test = test_data['type']

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit on the training text only, then transform both sets
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Linear SVM trained with Stochastic Gradient Descent (hinge loss, L2 penalty).
# NOTE(review): the loop below uses partial_fit, which runs one pass over the
# data per call; max_iter/tol are presumably not consulted there — confirm.
svm_model = SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, random_state=42, max_iter=1000, tol=1e-3)

# Hinge loss is mean(max(0, 1 - y * f(x))) with y in {-1, +1}. The encoded
# labels are {0, 1}; using them directly makes every class-0 sample contribute
# a constant loss of 1 regardless of the prediction. Map 0 -> -1 and 1 -> +1
# (a positive decision value corresponds to the class encoded as 1).
# This assumes a binary problem, as the binary precision/recall computed for
# this model also do.
y_train_signed = np.where(y_train == 1, 1, -1)
y_test_signed = np.where(y_test == 1, 1, -1)

# Collect loss values during training for both training and testing datasets
train_losses = []
test_losses = []
n_iterations = 1000  # Number of partial_fit passes over the training data

# The set of classes is loop-invariant; compute it once.
all_classes = np.unique(y_train)

for i in range(n_iterations):
    svm_model.partial_fit(X_train_tfidf, y_train, classes=all_classes)
    train_loss = np.mean(np.maximum(0, 1 - y_train_signed * svm_model.decision_function(X_train_tfidf)))
    test_loss = np.mean(np.maximum(0, 1 - y_test_signed * svm_model.decision_function(X_test_tfidf)))
    train_losses.append(train_loss)
    test_losses.append(test_loss)

In [None]:
# Hinge-loss curves: one line per dataset, x axis is the iteration index
iteration_axis = np.arange(n_iterations)
plt.figure(figsize=(10, 6))
for loss_curve, curve_label in ((train_losses, 'Training Data'),
                                (test_losses, 'Testing Data')):
    plt.plot(iteration_axis, loss_curve, label=curve_label)
plt.title('Convergence of Hinge Loss Function')
plt.xlabel('Iterations')
plt.ylabel('Hinge Loss')
plt.grid(True)
plt.legend()
plt.show()

# Test-set accuracy of the trained SGD model
y_pred = svm_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on the test set:", accuracy)

# Remaining classification metrics
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

# Confusion matrix of true vs. predicted labels
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Precision, recall and F1 with the scorers' default settings
precision, recall, f1 = (
    scorer(y_test, y_pred) for scorer in (precision_score, recall_score, f1_score)
)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

In [None]:
!pip install xgboost


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Load the datasets (the file names were missing their opening quote, which
# is a syntax error)
train_data = pd.read_csv("train_data.csv")
test_data = pd.read_csv("test_data.csv")

# Encode the target column as integers (fit on train, reuse on test)
label_encoder = LabelEncoder()
train_data['type'] = label_encoder.fit_transform(train_data['type'])
test_data['type'] = label_encoder.transform(test_data['type'])

# Fill missing values in the text data. The result is assigned back to the
# column: fillna(..., inplace=True) on a column selection is chained
# assignment and does not update the frame under Copy-on-Write.
train_data['Tokenized Text'] = train_data['Tokenized Text'].fillna('')
test_data['Tokenized Text'] = test_data['Tokenized Text'].fillna('')

# Define features (X) and target (y)
X_train = train_data['Tokenized Text']
y_train = train_data['type']
X_test = test_data['Tokenized Text']
y_test = test_data['type']

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit on the training text only, then transform both sets
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initialize XGBoost classifier
clf = xgb.XGBClassifier(
    objective='binary:logistic',  # Make sure this matches your type of classification
    learning_rate=0.1,
    n_estimators=100,
    max_depth=6,
    subsample=0.8,          # fraction of rows sampled per tree
    colsample_bytree=0.8,   # fraction of features sampled per tree
    random_state=42         # scikit-learn style seed parameter (was `seed`)
)

# Train XGBoost model
clf.fit(X_train_tfidf, y_train)

# Predictions
predictions = clf.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy on the test set: {accuracy * 100:.2f}%")

In [None]:
# Confusion matrix for XGBoost
# plot_confusion_matrix has been removed from scikit-learn; the replacement is
# ConfusionMatrixDisplay, which can draw an already computed matrix.
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Calculate confusion matrix with rows/columns in the classifier's class order
conf_matrix = confusion_matrix(y_test, predictions, labels=clf.classes_)

# Plot confusion matrix on an explicitly created axes so that the requested
# figure size applies to the plot (a bare plt.figure() would be left empty).
fig, ax = plt.subplots(figsize=(8, 6))
ConfusionMatrixDisplay(
    confusion_matrix=conf_matrix,
    display_labels=clf.classes_,
).plot(cmap=plt.cm.Blues, ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()

In [None]:
# numpy is needed for the array arithmetic below; imported here so the cell
# does not rely on an earlier cell having done so.
import numpy as np

# Re-train the XGBoost model while recording the classification error on the
# training and test sets after every boosting round. The metric is configured
# on the estimator: passing eval_metric to fit() is deprecated and rejected by
# newer XGBoost releases.
clf.set_params(eval_metric='error')
clf.fit(
    X_train_tfidf,
    y_train,
    eval_set=[(X_train_tfidf, y_train), (X_test_tfidf, y_test)],
    verbose=False,
)

# evals_result() is keyed 'validation_<i>' in eval_set order:
# validation_0 -> training set, validation_1 -> test set.
eval_results = clf.evals_result()
train_accuracy = 1 - np.array(eval_results['validation_0']['error'])
test_accuracy = 1 - np.array(eval_results['validation_1']['error'])

# Plot accuracy over boosting rounds
plt.figure(figsize=(10, 6))
plt.plot(train_accuracy, label='Training Accuracy')
plt.plot(test_accuracy, label='Testing Accuracy')
plt.xlabel('Boosting Round')
plt.ylabel('Accuracy')
plt.title('Accuracy Over Boosting Rounds')
plt.legend()
plt.grid(True)
plt.show()