# Predict which franchise a quote belongs to - SVM

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Load data (Star Wars & Star Trek scripts)

In [None]:
# Read the Star Trek script CSV into a DataFrame.
path = 'data/StScriptTNG.csv'
StarTrek_raw_df = pd.read_csv(filepath_or_buffer=path)
# Preview the first five rows.
StarTrek_raw_df.head(5)

The Star Wars script is structured as: "ID" "character" "dialogue"
* Uses a single space (' ') as the separator
* Uses " as the quote character
* Skips bad lines (potential glitches or errors)

In [None]:
# Parse the Star Wars script: fields are space-separated and wrapped in
# double quotes; rows that cannot be parsed are skipped.
sw_read_options = {
    'sep': ' ',
    'quotechar': '"',
    'on_bad_lines': 'skip',
}
StarWars_raw_df = pd.read_csv('data/SwScriptVI.txt', **sw_read_options)
# Preview the first five rows.
StarWars_raw_df.head(5)

* **The Star Wars script is stored line by line**
* **The Star Trek script is stored block by block**
* Turn the Star Trek script into line-by-line form

## Prepare Star Wars script

In [None]:
# Keep only the dialogue column and rename it to the shared 'text' column name.
# Rows with missing dialogue are dropped here because TfidfVectorizer raises
# on NaN documents further down the notebook.
sw_df = (
    StarWars_raw_df[['dialogue']]
    .dropna()
    .rename(columns={'dialogue': 'text'})
    .assign(label=0)  # 0 = Star Wars
)

## Prepare Star Trek script

In [None]:
# A speaker tag is 2+ uppercase letters (whitespace allowed between them, e.g.
# "BEVERLY CRUSHER"), optionally followed by whitespace, then a colon.
# The tag must start and end on a letter so that whitespace-only runs or a
# lone capital inside a sentence ("Plan B:") are not mistaken for a speaker.
pattern = r'([A-Z][A-Z\s]*[A-Z]\s*:)'

def extract_lines(script_block):
    """Split a block of script text into one record per spoken line.

    Parameters
    ----------
    script_block : str
        Raw script text in which every speech is introduced by a speaker tag
        matching ``pattern``.

    Returns
    -------
    list of dict
        One ``{'Character': name, 'Dialogue': text}`` dict per speech, in
        order of appearance. Text before the first speaker tag is ignored,
        and speeches whose dialogue is empty after stripping are skipped.
    """
    # Because the pattern has a capture group, re.split keeps the tags:
    # [preamble, tag1, speech1, tag2, speech2, ...]
    parts = re.split(pattern, script_block)

    lines = []
    for tag, speech in zip(parts[1::2], parts[2::2]):
        name = tag.replace(':', '').strip()
        dialogue = speech.strip()
        # An empty speech carries no signal for the classifier; skip it.
        if not dialogue:
            continue
        lines.append({'Character': name, 'Dialogue': dialogue})
    return lines

# Flatten every script block into individual character/dialogue records.
st_lines = [
    record
    for script in StarTrek_raw_df['script']
    for record in extract_lines(str(script))
]

# Keep only the dialogue, under the shared 'text' column name, and tag it.
st_df = (
    pd.DataFrame(st_lines)
    .drop(columns=['Character'])
    .rename(columns={'Dialogue': 'text'})
    .assign(label=1)  # 1 = Star Trek
)

## Balance rows

60,094 rows of Star Trek script vs 672 rows of Star Wars script
* need to downsample the larger dataset to the size of the smaller one (our bottleneck)

In [None]:
# Target size is the row count of whichever franchise has fewer lines.
min_size = min(len(sw_df), len(st_df))
assert min_size > 0, "both franchises need at least one line of dialogue"


def _downsample(frame, n, seed=42):
    """Return `frame` unchanged if it has at most `n` rows, else a seeded random sample of `n` rows."""
    if len(frame) <= n:
        return frame
    return frame.sample(n=n, random_state=seed)


# Only the larger frame is actually sampled; the smaller one passes through as-is.
sw_df_balanced = _downsample(sw_df, min_size)
st_df_balanced = _downsample(st_df, min_size)

# combine datasets
df_final = pd.concat([sw_df_balanced, st_df_balanced]).reset_index(drop=True)

print(df_final['label'].value_counts())


## Train Test Split

In [None]:
# Hold out 20% of the quotes for evaluation. Stratifying on the label keeps the
# Star Wars / Star Trek ratio the same in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    df_final['text'],
    df_final['label'],
    test_size=0.2,
    random_state=42,
    stratify=df_final['label'],
)

In [None]:
# TF-IDF over unigrams and bigrams with English stop words removed and the
# vocabulary limited to 2000 features.
tfidf_options = {
    'stop_words': 'english',
    'max_features': 2000,
    'ngram_range': (1, 2),
}
vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary from the training quotes only, then reuse it for the test quotes.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Linear-kernel SVM; probability=True is needed for predict_proba later on.
svm_options = {'kernel': 'linear', 'probability': True, 'random_state': 42}
svm_model = SVC(**svm_options)
svm_model.fit(X_train_tfidf, y_train)

In [None]:
# Score the held-out quotes with the fitted SVM.
y_pred = svm_model.predict(X_test_tfidf)

# Text classification report; display names are given in label order
# (0 = Star Wars, 1 = Star Trek).
class_names = ['StarWars', 'StarTrek']
report = classification_report(y_test, y_pred, target_names=class_names)
print(report)

In [None]:
# Try the model on a hand-written quote and show the class probabilities.
sample_quote = ["Set phasers to stun"]
sample_tfidf = vectorizer.transform(sample_quote)

# One row per quote; column 0 is label 0 (Star Wars), column 1 is label 1 (Star Trek).
probs = svm_model.predict_proba(sample_tfidf)
print(
    f"Star Wars Probability: {probs[0][0]:.2%}",
    f"Star Trek Probability: {probs[0][1]:.2%}",
    sep="\n",
)