In [5]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Read the labelled training split and the unlabelled test split (in that order).
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# No hold-out split here: every labelled row is used to fit the model.
X_train, y_train = train_df['text'], train_df['target']

# Two-stage model: raw text -> TF-IDF features -> SVM with a linear kernel.
classifier = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('svm', SVC(kernel='linear')),  # other kernels can be tried here
])

# Fit both stages on the full training data.
classifier.fit(X_train, y_train)

# Predict a label for every row of the test split.
predictions = classifier.predict(test_df['text'])

# Pair each test id with its predicted label and write the submission file
# without the DataFrame index column.
submission_df = test_df[['id']].assign(target=predictions)
submission_df.to_csv('submission.csv', index=False)


2644    So you have a new weapon that can cause un-ima...
2227    The f$&amp;@ing things I do for #GISHWHES Just...
5448    DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...
132     Aftershock back to school kick off was great. ...
6845    in response to trauma Children of Addicts deve...
                              ...                        
1835                @SmusX16475 Skype just crashed u host
506     Christian Attacked by Muslims at the Temple Mo...
3592    Man charged over fatal crash near Dubbo refuse...
6740    #usNWSgov Severe Weather Statement issued Augu...
1634    Great British &lt;b&gt;Bake&lt;/b&gt; Off's ba...
Name: text, Length: 1523, dtype: object


In [1]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch

# Read the labelled training split and the unlabelled test split.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Tokenizer and classification model are loaded from the same pre-trained
# checkpoint; the classification head is configured for two labels.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(BERT_CHECKPOINT, num_labels=2)
# Tokenize and preprocess the text data
def tokenize_data(data, max_len=128):
    """Encode the ``text`` column of ``data`` into a ``TensorDataset``.

    Uses the module-level ``tokenizer`` defined earlier in this cell.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``text`` column. A ``target`` column is optional.
    max_len : int, default 128
        Every sequence is truncated or padded to exactly this many tokens.

    Returns
    -------
    torch.utils.data.TensorDataset
        ``(input_ids, attention_mask, labels)`` when ``data`` has a
        ``target`` column, otherwise ``(input_ids, attention_mask)``.
    """
    # One batched tokenizer call replaces the per-row encode_plus loop and the
    # torch.cat that followed it; because every row is padded to max_len the
    # resulting tensors have the same (n_rows, max_len) layout as before.
    encoded = tokenizer(
        data['text'].tolist(),
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    input_ids = encoded['input_ids']
    attention_masks = encoded['attention_mask']

    # Labelled frames (training data) also carry their targets.
    if 'target' in data.columns:
        labels = torch.tensor(data['target'].values)
        return TensorDataset(input_ids, attention_masks, labels)

    return TensorDataset(input_ids, attention_masks)

# Encode both splits into tensor datasets.
train_dataset = tokenize_data(train_df)
test_dataset = tokenize_data(test_df)

# Define DataLoader for training and test sets.
# Only the training batches are shuffled; the test loader keeps row order so
# predictions can later be paired with test_df['id'] positionally.
batch_size = 16
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set up training parameters.
# transformers.AdamW is deprecated (and dropped from recent transformers
# releases) in favour of torch.optim.AdamW. eps and weight_decay are pinned to
# the values the transformers implementation used by default so the update
# rule stays the same.
# NOTE(review): the `AdamW` name in the `from transformers import ...` line at
# the top of this cell is no longer used and will fail to import on
# transformers versions that removed it.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
epochs = 3

# Fine-tune BERT on the entire training set
for epoch in range(epochs):
    model.train()
    for batch in train_dataloader:
        # Each batch is (input_ids, attention_mask, labels); move all three
        # tensors to the training device.
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Switch to evaluation mode and collect one predicted class per test row.
model.eval()
all_preds_test = []

# Gradients are not needed for inference.
with torch.no_grad():
    for input_ids, attention_mask in test_dataloader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        logits = model(input_ids, attention_mask=attention_mask).logits

        # The predicted class is the index of the largest logit in each row.
        preds_test = logits.argmax(dim=1).cpu().numpy()
        all_preds_test.extend(preds_test)

# Pair each test id with its predicted label and write the submission file
# without the DataFrame index column.
submission_df = test_df[['id']].assign(target=all_preds_test)
submission_df.to_csv('submission.csv', index=False)


ModuleNotFoundError: No module named 'transformers'