In [None]:
import numpy as np
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# Polynomial kernel helper from sklearn (computes pairwise kernel / Gram matrices)
from sklearn.metrics.pairwise import polynomial_kernel
from sklearn.model_selection import train_test_split

# Load the preprocessed training tweets and their target labels.
df = pd.read_csv('processed_train_data.csv')
# Coerce every tweet to str, the same way the test tweets are loaded further
# down: an empty text field is read back by pandas as NaN, and
# TfidfVectorizer rejects NaN as an invalid document.
tweets, labels = df['text'].map(str), df['target']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    tweets, labels, test_size=0.2, random_state=42
)

# Learn the TF-IDF vocabulary and weights on the training split only, then
# reuse the fitted vectorizer for the held-out split.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# SVM with a polynomial kernel. Passing the kernel function itself lets SVC
# build the Gram matrix from the feature rows at both fit and predict time.
# kernel='precomputed' would instead require the caller to supply an
# (n_samples, n_samples) Gram matrix, not the TF-IDF feature matrix used
# below, and fitting would fail on the non-square input.
svm = SVC(kernel=polynomial_kernel)

# Fit the classifier on the (densified) training features
svm.fit(X_train.toarray(), y_train)

# Predict the labels for the held-out split
y_pred = svm.predict(X_test.toarray())

# Accuracy = fraction of held-out tweets whose predicted label matches
accuracy = np.mean(y_pred == y_test)
print(f'Test accuracy: {accuracy:.2f}')

# Load the tweets to score, coercing every entry to str so that missing
# values do not break the vectorizer.
test = pd.read_csv('processed_test_data.csv')['text'].map(str)
# Reuse the vectorizer fitted on the training split so the columns line up
# with the features the classifier was trained on.
test_vector = vectorizer.transform(test).toarray()

predictions = svm.predict(test_vector)

# Build the submission in one pass: header line, then one "id,target" row per
# prediction. The comprehension keeps its loop variables local, so the
# builtin `id` is no longer shadowed at module level.
# NOTE(review): the ids written here are 0-based row positions, not ids read
# from the test file -- confirm that is what the submission format expects.
rows = [f'{row_id},{pred}\n' for row_id, pred in enumerate(predictions)]
csv = 'id,target\n' + ''.join(rows)
with open('submission.csv', 'w') as f:
    # A single write; writelines() on a str would emit it one character at a time.
    f.write(csv)