In [13]:
import os
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
import lime
import lime.lime_text
import matplotlib.pyplot as plt

In [14]:
# --- Experiment configuration -------------------------------------------------
# CSV file read with pd.read_csv in the data-loading cell.
base_data_path = 'IMDB Dataset.csv'
# Column holding the raw review text (becomes X).
text_col = 'review'
# Column holding the class label (becomes y).
label_col = 'sentiment'
# Display names handed to LimeTextExplainer; index 0 / 1 correspond to
# column 0 / 1 of the prediction function given to LIME.
class_names = ['negative', 'positive']
# Vocabulary cap passed to TfidfVectorizer(max_features=...).
max_features = 5000
# Number of folds for StratifiedKFold.
n_splits = 5
# Seed shared by StratifiedKFold and LinearSVC.
random_state = 42
# Position, within the best fold's validation texts, of the review explained by LIME.
sample_idx = 0
# Directory where the LIME HTML report is written; created if it does not exist.
output_dir = 'outputs_svc'
os.makedirs(output_dir, exist_ok=True)

In [15]:
# Load the dataset and split it into the text feature (X) and the label (y).
df = pd.read_csv(base_data_path)
X, y = df[text_col], df[label_col]

In [None]:
# Stratified k-fold cross-validation of a TF-IDF + LinearSVC classifier.
# Each fold fits its own vectorizer on the training split only, so no
# vocabulary/IDF statistics leak from the validation split.
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
accuracies = []
models = []
vectorizers = []
# Validation row positions of every fold, kept so the best fold's validation
# set can be recovered later without re-running kf.split.
val_indices = []

startTime = time.time()
for fold, (train_idx, val_idx) in enumerate(kf.split(X, y), start=1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    vect = TfidfVectorizer(max_features=max_features)
    X_train_tfidf = vect.fit_transform(X_train)
    X_val_tfidf = vect.transform(X_val)

    clf = LinearSVC(random_state=random_state)
    clf.fit(X_train_tfidf, y_train)
    y_pred = clf.predict(X_val_tfidf)
    acc = accuracy_score(y_val, y_pred)

    accuracies.append(acc)
    models.append(clf)
    vectorizers.append(vect)
    val_indices.append(val_idx)
    print(f"LinearSVC Fold {fold} Accuracy: {acc:.4f}")
endTime = time.time()

# Keep the model/vectorizer pair of the highest-scoring fold together with
# that same fold's validation texts (index reset so positions start at 0).
best_idx = int(np.argmax(accuracies))
best_model = models[best_idx]
best_vect = vectorizers[best_idx]
val_idx = val_indices[best_idx]
X_val_best = X.iloc[val_idx].reset_index(drop=True)

print(f"\nLinearSVC Avg Accuracy: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")
# Average wall-clock time per fold: divide by the number of folds actually
# run rather than a hard-coded 5, so the figure stays right if n_splits changes.
print("Time : " , round((endTime - startTime) / len(accuracies) , 2) , "seconds")

# LIME explanation
sample_text = X_val_best.iloc[sample_idx]
print(f"\nSample Text for LIME:\n{sample_text}")
# Not used below; kept in case later cells rely on it.
pipeline = make_pipeline(best_vect, best_model)


def svc_predict_proba(texts):
    """Return an (n_samples, 2) array of pseudo-probabilities for LIME.

    LinearSVC has no predict_proba, and its decision_function returns
    unbounded signed margins. Feeding ``[1 - margin, margin]`` to LIME
    produces rows that can be negative or exceed 1, so the margins are
    squashed into (0, 1) with a logistic sigmoid first. Each row is
    ``[1 - p, p]``, with ``p`` derived from the margin and placed in
    column 1, and always sums to 1.

    The sigmoid is a monotone squashing, not a calibrated probability.
    """
    margins = best_model.decision_function(best_vect.transform(texts))
    # Overflow-free form of 1 / (1 + exp(-margins)).
    p_positive = 0.5 * (1.0 + np.tanh(0.5 * margins))
    return np.column_stack([1.0 - p_positive, p_positive])


explainer = lime.lime_text.LimeTextExplainer(class_names=class_names)
exp = explainer.explain_instance(
    sample_text,
    svc_predict_proba,
    num_features=10
)
exp.show_in_notebook(text=sample_text)
exp.save_to_file(os.path.join(output_dir, 'lime_explanation_LinearSVC.html'))
print("\nLIME for LinearSVC saved.")


LinearSVC Fold 1 Accuracy: 0.8895
LinearSVC Fold 2 Accuracy: 0.8872
LinearSVC Fold 3 Accuracy: 0.8853
LinearSVC Fold 4 Accuracy: 0.8873
LinearSVC Fold 5 Accuracy: 0.8883

LinearSVC Avg Accuracy: 0.8875 ± 0.0014
 / Time :  8.77 seconds

Sample Text for LIME:
Phil the Alien is one of those quirky films where the humour is based around the oddness of everything rather than actual punchlines.<br /><br />At first it was very odd and pretty funny but as the movie progressed I didn't find the jokes or oddness funny anymore.<br /><br />Its a low budget film (thats never a problem in itself), there were some pretty interesting characters, but eventually I just lost interest.<br /><br />I imagine this film would appeal to a stoner who is currently partaking.<br /><br />For something similar but better try "Brother from another planet"


KeyboardInterrupt: 