In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
#a
# Step 1: Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Step 2: Load dataset (assuming 'news_dataset.csv' contains 'text' and 'category' columns)
df = pd.read_csv('/kaggle/input/news-dataset/india-news-headlines.csv')

# Display first few rows to understand the structure
print(df.head())
print(df.columns)

# Step 3: Preprocessing
# Extract features and labels
X = df['headline_text']  # Text articles
y = df['headline_category']  # Corresponding categories

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Bag-of-Words Representation
# Initialize CountVectorizer for converting text data into Bag-of-Words format
vectorizer = CountVectorizer()

# Fit the vectorizer on training data and transform both train and test data
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# Step 5: Model Training
# Initialize Naive Bayes classifier
clf = MultinomialNB()

# Train the classifier with training data
clf.fit(X_train_counts, y_train)

# Step 6: Model Evaluation
# Predict categories for the test data
y_pred = clf.predict(X_test_counts)

# Print classification report to evaluate the model's performance
print(classification_report(y_test, y_pred))

# Optional: Predict category for a sample news article
sample_news = ["Researchers have discovered a new species of butterfly in the Amazon rainforest."]
sample_news_counts = vectorizer.transform(sample_news)
predicted_category = clf.predict(sample_news_counts)
print(f"Predicted Category: {predicted_category[0]}")


   publish_date headline_category  \
0      20010101        sports.wwe   
1      20010102           unknown   
2      20010102           unknown   
3      20010102           unknown   
4      20010102           unknown   

                                       headline_text  
0  win over cena satisfying but defeating underta...  
1  Status quo will not be disturbed at Ayodhya; s...  
2                Fissures in Hurriyat over Pak visit  
3              America's unwanted heading for India?  
4                 For bigwigs; it is destination Goa  
Index(['publish_date', 'headline_category', 'headline_text'], dtype='object')


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                                            precision    recall  f1-score   support

                                                2010-stars       0.00      0.00      0.00        10
                                        2011-top-slideshow       0.00      0.00      0.00         4
                                          2011-top-stories       0.00      0.00      0.00         5
             2013-the-year-sachin-bids-adieu.football-2013       0.00      0.00      0.00         2
          2013-the-year-sachin-bids-adieu.more-sports-2013       0.00      0.00      0.00         1
               2013-the-year-sachin-bids-adieu.tennis-2013       0.00      0.00      0.00         2
                                2014-sochi-winter-olympics       0.00      0.00      0.00         5
                                                aap-crisis       0.00      0.00      0.00         5
                                                 actresses       0.00      0.00      0.00         1

In [20]:
#c
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Step 1: Load the dataset with a specified encoding
df = pd.read_csv('/kaggle/input/email-spam-detection/spam.csv', encoding='ISO-8859-1')

# Display the first few rows of the dataset
print(df.head())
print(df.columns)  # Check the column names

# Rename the columns for easier access
df.columns = ['label', 'message', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']

# Step 2: Data Preprocessing
# Convert labels to binary (0 for non-spam, 1 for spam)
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Split the dataset into features and labels
X = df['message']  # Ensure this is the correct column name for the messages
y = df['label']    # Spam or non-spam labels

# Tokenization and Padding
max_words = 5000  # Maximum number of words to consider
max_len = 100     # Maximum length of sequences

# Tokenize the text data
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)

# Pad sequences to ensure uniform input size
X_padded = pad_sequences(X_sequences, maxlen=max_len, padding='post')

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# Step 3: Build the LSTM Model
embedding_dim = 64

model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len))
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model Summary
model.summary()

# Step 4: Train the Model
batch_size = 32
epochs = 5

history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test))

# Step 5: Evaluate the Model
# Predictions on test set
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Compute evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Step 6: Testing on a Sample Email
sample_email = ["Congratulations! You've won a free vacation. Click here to claim your prize."]

# Tokenize and pad the sample email
sample_sequence = tokenizer.texts_to_sequences(sample_email)
sample_padded = pad_sequences(sample_sequence, maxlen=max_len, padding='post')

# Predict whether the sample email is spam or not
sample_pred_prob = model.predict(sample_padded)
sample_pred = 'Spam' if sample_pred_prob > 0.5 else 'Not Spam'

print(f"Sample Email Classification: {sample_pred}")


     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  
Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')




Epoch 1/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 52ms/step - accuracy: 0.8668 - loss: 0.4735 - val_accuracy: 0.8655 - val_loss: 0.3960
Epoch 2/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 49ms/step - accuracy: 0.8609 - loss: 0.4250 - val_accuracy: 0.8655 - val_loss: 0.4009
Epoch 3/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 50ms/step - accuracy: 0.8696 - loss: 0.4095 - val_accuracy: 0.8655 - val_loss: 0.4009
Epoch 4/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 51ms/step - accuracy: 0.8660 - loss: 0.4141 - val_accuracy: 0.8655 - val_loss: 0.3956
Epoch 5/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 50ms/step - accuracy: 0.8628 - loss: 0.4186 - val_accuracy: 0.8655 - val_loss: 0.3963
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step
Test Accuracy: 0.8655
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#b


import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

# Step 1: Load and Preprocess Data
# Load the dataset into a pandas DataFrame (assuming 'headline_text' and 'headline_category' columns)
df = pd.read_csv('/kaggle/input/news-dataset/india-news-headlines.csv')  # Update with your dataset file

# Drop missing values
df.dropna(inplace=True)

# Map categories to numerical labels
category_to_id = {category: idx for idx, category in enumerate(df['headline_category'].unique())}
df['headline_category'] = df['headline_category'].map(category_to_id)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['headline_text'], df['headline_category'], test_size=0.2, random_state=42)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to tokenize and encode sequences
def encode_data(texts, max_length=128):
    return tokenizer(
        list(texts),
        add_special_tokens=True,  # Add [CLS] and [SEP]
        max_length=max_length,    # Pad & truncate all sentences.
        padding='max_length',     # Pad to the maximum length
        truncation=True,          # Truncate longer sentences
        return_attention_mask=True,  # Construct attention masks
        return_tensors='pt'  # Return PyTorch tensors
    )

# Tokenize and encode the sequences
train_encodings = encode_data(X_train)
test_encodings = encode_data(X_test)

# Convert labels to tensors
train_labels = torch.tensor(y_train.values)
test_labels = torch.tensor(y_test.values)

# Step 2: Prepare Data for Model
# Create TensorDataset for training and validation data
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

# Create DataLoader for efficient batch processing
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# Step 3: Load Pre-trained BERT Model for Sequence Classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(category_to_id))

# Move model to GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Optimizer and Learning Rate Scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 3
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Step 4: Train the Model
for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    
    for batch in tqdm(train_loader, desc=f'Training Epoch {epoch+1}'):
        input_ids, attention_mask, labels = [x.to(device) for x in batch]
        
        model.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_train_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        scheduler.step()
    
    avg_train_loss = total_train_loss / len(train_loader)
    print(f'Average training loss for epoch {epoch+1}: {avg_train_loss}')

# Step 5: Evaluate the Model
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions.append(torch.argmax(logits, dim=1).cpu().numpy())
        true_labels.append(labels.cpu().numpy())

# Flatten predictions and true labels
predictions = [item for sublist in predictions for item in sublist]
true_labels = [item for sublist in true_labels for item in sublist]

# Compute evaluation metrics
accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions, average='weighted')

print(f'Validation Accuracy: {accuracy}')
print(f'Validation F1-Score: {f1}')

# Step 6: Make Predictions on New Data
sample_text = ["Researchers have discovered a new species of butterfly in the Amazon rainforest."]
sample_encoding = encode_data(sample_text)
sample_input_ids = sample_encoding['input_ids'].to(device)
sample_attention_mask = sample_encoding['attention_mask'].to(device)

with torch.no_grad():
    outputs = model(sample_input_ids, attention_mask=sample_attention_mask)
    prediction = torch.argmax(outputs.logits, dim=1).cpu().numpy()[0]

# Map numerical prediction back to category
category_prediction = list(category_to_id.keys())[list(category_to_id.values()).index(prediction)]
print(f"Predicted Category: {category_prediction}")
