In [12]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import pipeline, linear_model, model_selection, metrics, multioutput, svm, preprocessing
from sklearn.model_selection import TunedThresholdClassifierCV
import fasttext


# Create vectorizer

In [3]:
# Summary-based train/test splits; change the file names here for the subtitle dataset.
X_train, y_train, X_test, y_test = (
    pd.read_csv(csv_path)
    for csv_path in ('X_train_sum.csv', 'y_train_sum.csv', 'X_test_sum.csv', 'y_test_sum.csv')
)
# Label names are the column headers of the test label table.
target_names = y_test.columns.values

In [4]:
"""Train and evaluate model with given features, and save predictions."""
def evalmodel(X_train, X_test, y_train, y_test, target_names, output_file):
    """Train and evaluate model with given features, and save predictions.

    Wraps a class-balanced, L2-penalised logistic regression (liblinear) in a
    ``TunedThresholdClassifierCV`` (10-fold) and a ``MultiOutputClassifier``,
    fits it on the training split, predicts the test split, writes the
    predictions to ``output_file`` and scores them against ``y_test``.

    Parameters
    ----------
    X_train, X_test
        Feature matrices for the train and test split.
    y_train, y_test
        Label tables for the train and test split
        (presumably one binary column per entry of ``target_names`` - confirm).
    target_names
        Label names; used as the CSV header and as row names in the report.
    output_file
        Destination of the predictions CSV. When it is a path, missing parent
        directories are created.

    Returns
    -------
    tuple
        ``(model, scores, accuracy)``: the fitted multi-output model, the
        classification report as a DataFrame (one row per label/average) and
        the value of ``metrics.accuracy_score``.
    """
    import os

    # Create the destination folder *before* the expensive cross-validated fit:
    # to_csv refuses to write into a non-existent directory, which would
    # otherwise throw away the trained model at the very last step.
    if isinstance(output_file, (str, os.PathLike)):
        out_dir = os.path.dirname(os.fspath(output_file))
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    clf = LogisticRegression(penalty='l2', class_weight='balanced', random_state=0, solver='liblinear', max_iter=1000)
    clf_tuned = TunedThresholdClassifierCV(clf, cv=10, n_jobs=-1)
    model = multioutput.MultiOutputClassifier(clf_tuned)
    model.fit(X_train, y_train)

    # Generate predictions
    y_pred = model.predict(X_test)

    # Save predictions to a CSV file
    predictions_df = pd.DataFrame(y_pred, columns=target_names)
    predictions_df.to_csv(output_file, index=False)

    # Generate classification report and accuracy
    scores = pd.DataFrame(metrics.classification_report(y_test, y_pred, target_names=target_names, zero_division=0, output_dict=True)).T
    accuracy = metrics.accuracy_score(y_test, y_pred)

    return model, scores, accuracy


In [5]:
def test_fit(X_test, vectorizer):
    X_test = vectorizer.transform(X_test['summary'])
    return X_test

In [6]:
def prediction(X_train, y_train, X_test, y_test, n1, n2, max_words, target_names, output_file):
    """TF-IDF features + logistic-regression evaluation in one call.

    Fits a TF-IDF vectorizer (n-gram range ``(n1, n2)``, at most ``max_words``
    features, L2-normalised, sublinear tf) on the training summaries, applies
    it to the test summaries, and passes both matrices to ``evalmodel``.

    Returns the ``(model, scores, accuracy)`` triple produced by ``evalmodel``.
    """
    tfidf = TfidfVectorizer(
        ngram_range=(n1, n2),
        max_features=max_words,
        norm='l2',
        sublinear_tf=True,
    )
    # The vocabulary is learned from the training split only; the test split is just transformed.
    train_features = tfidf.fit_transform(X_train['summary'])
    test_features = test_fit(X_test, tfidf)

    fitted_model, report, acc = evalmodel(
        train_features, test_features, y_train, y_test, target_names, output_file
    )
    return fitted_model, report, acc

In [12]:
# Logistic regression on TF-IDF features: n-gram range (1, 2), at most 5000 features.
# Adjust n1/n2, max_words and output_file to run other configurations.
prediction(
    X_train, y_train, X_test, y_test,
    n1=1, n2=2, max_words=5000,
    target_names=target_names,
    output_file="results/summary/10f/sum_logreg_bigram_5000.csv",
)

(MultiOutputClassifier(estimator=TunedThresholdClassifierCV(cv=10,
                                                            estimator=LogisticRegression(class_weight='balanced',
                                                                                         max_iter=1000,
                                                                                         random_state=0,
                                                                                         solver='liblinear'),
                                                            n_jobs=-1)),
                           precision    recall  f1-score  support
 husband and wife           0.476190  0.769231  0.588235     26.0
 infatuation                0.407407  0.611111  0.488889     18.0
 friendship                 0.193548  0.900000  0.318584     20.0
 romantic love              0.147541  0.642857  0.240000     14.0
 the desire for vengeance   0.242424  0.500000  0.326531     16.0
 humanoid robot             0.6

# SVM

In [15]:
def evalmodel2(X_train, X_test, y_train, y_test, target_names, output_file):
    """Train and evaluate model with given features, and save predictions.

    Wraps a class-balanced ``LinearSVC`` in a ``TunedThresholdClassifierCV``
    (10-fold) and a ``MultiOutputClassifier``, fits it on the training split,
    predicts the test split, writes the predictions to ``output_file`` and
    scores them against ``y_test``.

    Parameters
    ----------
    X_train, X_test
        Feature matrices for the train and test split.
    y_train, y_test
        Label tables for the train and test split
        (presumably one binary column per entry of ``target_names`` - confirm).
    target_names
        Label names; used as the CSV header and as row names in the report.
    output_file
        Destination of the predictions CSV. When it is a path, missing parent
        directories are created.

    Returns
    -------
    tuple
        ``(model, scores, accuracy)``: the fitted multi-output model, the
        classification report as a DataFrame (one row per label/average) and
        the value of ``metrics.accuracy_score``.
    """
    import os

    # Create the destination folder *before* the expensive cross-validated fit:
    # to_csv refuses to write into a non-existent directory, which would
    # otherwise throw away the trained model at the very last step.
    if isinstance(output_file, (str, os.PathLike)):
        out_dir = os.path.dirname(os.fspath(output_file))
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    clf = svm.LinearSVC(class_weight='balanced', random_state=0)
    clf_tuned = TunedThresholdClassifierCV(clf, cv=10, n_jobs=-1)
    model = multioutput.MultiOutputClassifier(clf_tuned)
    model.fit(X_train, y_train)

    # Generate predictions
    y_pred = model.predict(X_test)

    # Save predictions to a CSV file
    predictions_df = pd.DataFrame(y_pred, columns=target_names)
    predictions_df.to_csv(output_file, index=False)

    # Generate classification report and accuracy
    scores = pd.DataFrame(metrics.classification_report(y_test, y_pred, target_names=target_names, zero_division=0, output_dict=True)).T
    accuracy = metrics.accuracy_score(y_test, y_pred)

    return model, scores, accuracy


In [15]:
def prediction2(X_train, y_train, X_test, y_test, n1, n2, max_words, target_names, output_file):
    """TF-IDF features + linear-SVM evaluation in one call.

    Builds a TF-IDF vectorizer (n-gram range ``(n1, n2)``, at most
    ``max_words`` features, L2-normalised, sublinear tf), fits it on the
    training summaries, transforms the test summaries with it and passes both
    matrices to ``evalmodel2``.

    Returns the ``(model, scores, accuracy)`` triple produced by ``evalmodel2``.
    """
    tfidf_options = {
        'ngram_range': (n1, n2),
        'max_features': max_words,
        'norm': 'l2',
        'sublinear_tf': True,
    }
    tfidf = TfidfVectorizer(**tfidf_options)

    # Fit on the training summaries only, then reuse the fitted vectorizer for the test split.
    train_matrix = tfidf.fit_transform(X_train['summary'])
    test_matrix = test_fit(X_test, tfidf)

    svm_model, svm_scores, svm_accuracy = evalmodel2(
        train_matrix, test_matrix, y_train, y_test, target_names, output_file
    )
    return svm_model, svm_scores, svm_accuracy

In [17]:
# Linear SVM on TF-IDF features: n-gram range (1, 2), at most 10000 features.
# Adjust n1/n2, max_words and output_file to run other configurations.
prediction2(
    X_train, y_train, X_test, y_test,
    n1=1, n2=2, max_words=10000,
    target_names=target_names,
    output_file="results/summary/10f/sum_svm_bigram_10000.csv",
)

(MultiOutputClassifier(estimator=TunedThresholdClassifierCV(cv=10,
                                                            estimator=LinearSVC(class_weight='balanced',
                                                                                random_state=0),
                                                            n_jobs=-1)),
                           precision    recall  f1-score  support
 husband and wife           0.512821  0.769231  0.615385     26.0
 infatuation                0.375000  0.666667  0.480000     18.0
 friendship                 0.218391  0.950000  0.355140     20.0
 romantic love              0.180000  0.642857  0.281250     14.0
 the desire for vengeance   0.294118  0.625000  0.400000     16.0
 humanoid robot             0.576923  0.937500  0.714286     16.0
 father and son             0.400000  0.923077  0.558140     13.0
 human vs. captivity        0.320000  0.727273  0.444444     11.0
 time travel                0.818182  0.750000  0.782609     12.

# fastText

In [8]:
# Prepare the summaries as input for fastText: replace newlines with spaces
# and trim surrounding whitespace, so every summary becomes one line of text.
train_texts, test_texts = (
    split['summary'].str.replace('\n', ' ', regex=False).str.strip().tolist()
    for split in (X_train, X_test)
)

In [9]:
# 'theme' column of each split as a plain Python list.
train_labels, test_labels = (list(split['theme']) for split in (X_train, X_test))

## Logistic Regression

In [13]:
# Load a pre-trained FastText model.
# The model file must be downloaded first; change the path below before running.
model_ft = fasttext.load_model('cc.en.300.bin')

# One embedding per summary: the whole summary is passed to fastText as a single "sentence".
train_embeddings = np.array(list(map(model_ft.get_sentence_vector, train_texts)))
test_embeddings = np.array(list(map(model_ft.get_sentence_vector, test_texts)))

# Logistic-regression evaluation on the embedding features.
model, scores, accuracy = evalmodel(
    train_embeddings,
    test_embeddings,
    y_train,
    y_test,
    target_names,
    "results/sum_fast_logreg.csv",
)

(scores, accuracy)

(                          precision    recall  f1-score  support
 husband and wife           0.432432  0.615385  0.507937     26.0
 infatuation                0.232558  0.555556  0.327869     18.0
 friendship                 0.180000  0.450000  0.257143     20.0
 romantic love              0.137255  0.500000  0.215385     14.0
 the desire for vengeance   0.140351  0.500000  0.219178     16.0
 humanoid robot             0.451613  0.875000  0.595745     16.0
 father and son             0.222222  0.615385  0.326531     13.0
 human vs. captivity        0.173913  0.727273  0.280702     11.0
 time travel                0.189189  0.583333  0.285714     12.0
 greed for riches           0.187500  0.600000  0.285714     15.0
 micro avg                  0.220183  0.596273  0.321608    161.0
 macro avg                  0.234703  0.602193  0.330192    161.0
 weighted avg               0.250354  0.596273  0.343801    161.0
 samples avg                0.239664  0.602067  0.318691    161.0,
 0.031007

## SVM

In [16]:
# Reuse the fastText sentence embeddings computed in the logistic-regression
# cell above (train_embeddings / test_embeddings). They are built from the same
# model file and the same train_texts / test_texts, so loading the pre-trained
# model again and re-embedding every summary here only repeated that work.
# NOTE: run the cell above first; this cell depends on its variables.
model, scores, accuracy = evalmodel2(
    train_embeddings,
    test_embeddings,
    y_train,
    y_test,
    target_names,
    "results/sum_fast_svm.csv",
)

scores, accuracy

(                          precision    recall  f1-score  support
 husband and wife           0.404762  0.653846  0.500000     26.0
 infatuation                0.235294  0.444444  0.307692     18.0
 friendship                 0.147826  0.850000  0.251852     20.0
 romantic love              0.126984  0.571429  0.207792     14.0
 the desire for vengeance   0.145455  0.500000  0.225352     16.0
 humanoid robot             0.500000  0.875000  0.636364     16.0
 father and son             0.258065  0.615385  0.363636     13.0
 human vs. captivity        0.177778  0.727273  0.285714     11.0
 time travel                0.200000  0.416667  0.270270     12.0
 greed for riches           0.217391  0.666667  0.327869     15.0
 micro avg                  0.212810  0.639752  0.319380    161.0
 macro avg                  0.241355  0.632071  0.337654    161.0
 weighted avg               0.253366  0.639752  0.349711    161.0
 samples avg                0.227593  0.651163  0.317682    161.0,
 0.023255