In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
nltk.download('wordnet')

# Load the data
train_data = pd.read_csv('/content/Train.csv')
val_data = pd.read_csv('/content/Valid.csv')
test_data = pd.read_csv('/content/Test.csv')



# Lemmatization
lemmatizer = WordNetLemmatizer()
train_data['text'] = train_data['text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

# Custom stopwords
custom_stopwords = set(['list', 'of', 'custom', 'stopwords'])
stopwords = ENGLISH_STOP_WORDS.union(custom_stopwords)
tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords)

# Data preparation
train_data['text'] = train_data['text'].str.lower()
train_data['text'] = train_data['text'].str.replace('[^\w\s]','')
train_data['text'] = train_data['text'].str.replace('\d+', '')
train_data['text'] = train_data['text'].str.strip()

val_data['text'] = val_data['text'].str.lower()
val_data['text'] = val_data['text'].str.replace('[^\w\s]','')
val_data['text'] = val_data['text'].str.replace('\d+', '')
val_data['text'] = val_data['text'].str.strip()

# Split the data into training, validation, and testing sets
X_train, X_test, y_train, y_test = train_test_split(train_data['text'], train_data['label'], test_size=0.2, random_state=42)
X_val = val_data['text']
y_val = val_data['label']

# Feature extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Hyperparameter tuning using grid search
C_values = [0.1, 1, 10, 100]
max_iter_values = [1000, 10000, 100000]
param_grid = {'C': C_values, 'max_iter': max_iter_values}
svm = LinearSVC()
grid_search = GridSearchCV(svm, param_grid, cv=5)
grid_search.fit(X_train_tfidf, y_train)

# Best hyperparameters
best_C = grid_search.best_params_['C']
best_max_iter = grid_search.best_params_['max_iter']
print('Best hyperparameters:', grid_search.best_params_)

# Train the final model with best hyperparameters
svm = LinearSVC(C=best_C, max_iter=best_max_iter)
X_full = pd.concat([train_data['text'], val_data['text']], axis=0)
y_full = pd.concat([train_data['label'], val_data['label']], axis=0)
X_full_tfidf = tfidf_vectorizer.fit_transform(X_full)
svm.fit(X_full_tfidf, y_full)

# Model evaluation on test set
X_test_tfidf = tfidf_vectorizer.transform(X_test)
y_pred_test = svm.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred_test)
print('Test accuracy:', accuracy)

[nltk_data] Downloading package wordnet to /root/nltk_data...
  train_data['text'] = train_data['text'].str.replace('[^\w\s]','')
  train_data['text'] = train_data['text'].str.replace('\d+', '')
  val_data['text'] = val_data['text'].str.replace('[^\w\s]','')
  val_data['text'] = val_data['text'].str.replace('\d+', '')


Best hyperparameters: {'C': 1, 'max_iter': 1000}
Test accuracy: 0.99025


In [2]:
# Additional steps to improve accuracy
# 1. Increase the size of the training data
X_train_full = pd.concat([train_data['text'], val_data['text']], axis=0)
y_train_full = pd.concat([train_data['label'], val_data['label']], axis=0)
X_train_full_tfidf = tfidf_vectorizer.transform(X_train_full)
svm.fit(X_train_full_tfidf, y_train_full)

# 2. Try different values for C and max_iter
C_values = [0.01, 0.1, 1, 10, 100]
max_iter_values = [1000, 10000, 100000]
param_grid = {'C': C_values, 'max_iter': max_iter_values}
grid_search = GridSearchCV(svm, param_grid, cv=5)
grid_search.fit(X_train_full_tfidf, y_train_full)

# Best hyperparameters
best_C = grid_search.best_params_['C']
best_max_iter = grid_search.best_params_['max_iter']
print('Best hyperparameters:', grid_search.best_params_)

# Train the final model with the best hyperparameters
svm = LinearSVC(C=best_C, max_iter=best_max_iter)
svm.fit(X_train_full_tfidf, y_train_full)

# Model evaluation on test set
X_test_tfidf = tfidf_vectorizer.transform(X_test)
y_pred_test = svm.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred_test)
print('Test accuracy:', accuracy)



Best hyperparameters: {'C': 0.1, 'max_iter': 1000}
Test accuracy: 0.937


In [1]:
# 

In [31]:
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

# Create and fit the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_full)

with open('model.pkl', 'wb') as file:
    pickle.dump(model, file)

with open('model.pkl', 'rb') as file:
    model = pickle.load(file)



with open('X_tra_tfidf.pkl', 'wb') as file:
    pickle.dump(tfidf_vectorizer, file)

with open('X_tra_tfidf.pkl', 'rb') as file:
    tfidf_vectorizer = pickle.load(file)

# Perform transformation on the test data using the loaded vectorizer
x = tfidf_vectorizer.transform([X_val[127]])
# print(x)
# Make predictions on the transformed test data using the loaded model
y_pred_test = model.predict(x)

  (0, 163557)	0.1023711757343013
  (0, 157937)	0.1273416287832618
  (0, 147609)	0.10792221141523038
  (0, 146809)	0.05066695123922059
  (0, 145727)	0.06286042017270566
  (0, 145024)	0.15609797826242466
  (0, 142876)	0.10936512973880876
  (0, 140768)	0.22983559041771545
  (0, 140675)	0.14465634060838314
  (0, 136865)	0.15556789538441032
  (0, 136225)	0.20424907960385158
  (0, 131140)	0.12262118122234779
  (0, 129010)	0.10973514025156242
  (0, 126865)	0.0895271288715064
  (0, 126316)	0.10079004069083665
  (0, 124366)	0.0901807788928747
  (0, 122566)	0.08070483270639874
  (0, 119694)	0.13785636278511637
  (0, 116117)	0.11060911237466318
  (0, 111563)	0.12515380965373774
  (0, 110865)	0.07999547828422703
  (0, 110304)	0.13907700982115617
  (0, 108152)	0.11758524701332583
  (0, 107972)	0.0993453918230032
  (0, 106811)	0.10529063340383536
  :	:
  (0, 82836)	0.0445446515999761
  (0, 78181)	0.08415043022211138
  (0, 74507)	0.11344855149962992
  (0, 74172)	0.08062841024965584
  (0, 70051)	0.073

In [30]:
y_pred_test

array([1])

In [35]:
[X_val[2]]

['the guidelines state that a comment must contain a minimum of four lines that is the only reason i am saying anything more about tomcats because after all my one line summary really says everything there is to say there is absolutely nothing remotely entertaining in this film']

In [34]:
x = tfidf_vectorizer.transform([X_val[2]])
print(x)
# Make predictions on the transformed test data using the loaded model
y_pred_test = model.predict(x)
y_pred_test


  (0, 147713)	0.42832562697341464
  (0, 140704)	0.2600426417605123
  (0, 137684)	0.2118262890256179
  (0, 125599)	0.24195033192660448
  (0, 125565)	0.19026238631496886
  (0, 125542)	0.11589472679246648
  (0, 119740)	0.2600426417605123
  (0, 118025)	0.1508725963141069
  (0, 117915)	0.09667866813120438
  (0, 92091)	0.2951302263048865
  (0, 83150)	0.21661613309114133
  (0, 83108)	0.14957184603430104
  (0, 61383)	0.4126863218308397
  (0, 51222)	0.06925815059992299
  (0, 45517)	0.1746615607725019
  (0, 29612)	0.2656478157106921
  (0, 28078)	0.18530596396472493
  (0, 582)	0.17095393057334457


array([0])

In [14]:
# Assuming your trained model object is named 'model'
with open('X_train_tfidf.pkl', 'wb') as file:
    pickle.dump(X_train_tfidf, file)

In [16]:
with open('X_train_tfidf.pkl', 'rb') as file:
    model = pickle.load(file)

In [3]:
# Generate predictions on the competition test set
X_comp_test_tfidf = tfidf_vectorizer.transform(test_data['text'])
comp_test_predictions = svm.predict(X_comp_test_tfidf)

# Prepare submission file
submission_df = pd.DataFrame({'Id': test_data.iloc[:, 0], 'Label': comp_test_predictions})

# Save submission file
submission_df.to_csv('/content/submission.csv', index=False)

In [4]:
import pickle

In [5]:
# Assuming your trained model object is named 'model'
with open('model.pkl', 'wb') as file:
    pickle.dump(svm, file)

In [6]:
with open('model.pkl', 'rb') as file:
    model = pickle.load(file)

In [7]:
# Assuming your trained model object is named 'model'
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(tfidf_vectorizer, file)

In [8]:
with open('tfidf_vectorizer.pkl', 'rb') as file:
    model = pickle.load(file)