## Star Trek Dialogue Analysis - Set Up

In [1]:
import json
import pandas as pd
import os
from itertools import chain
# import re
# import numpy as np
# import logging
# from collections import Counter

# # Spacy
# import spacy

# # NLTK
# import nltk
# from nltk.tokenize import word_tokenize
# from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer

# # Gensim
# import gensim
# from gensim.utils import simple_preprocess
# from gensim import corpora
# from gensim.models import LdaModel
# from pprint import pprint
# from gensim.models import CoherenceModel

# # Plotting
# import seaborn as sns
# import pyLDAvis
# import pyLDAvis.gensim_models
# import matplotlib.pyplot as plt
# %matplotlib inline

In [2]:
# Root folder holding the raw Star Trek data files.
# The STAR_TREK_DATA_DIR environment variable overrides the location so the notebook
# can run on other machines; when it is unset the original folder is used unchanged.
path = os.environ.get(
    'STAR_TREK_DATA_DIR',
    r'C:\Users\anon\Documents\CareerFoundry\Data Analytics Immersion\6\Data\Raw Data',
)

In [3]:
# Reading in the whole data set as dictionary

# Decode the JSON explicitly as UTF-8 instead of relying on the platform's default
# text encoding, which differs between systems and can garble non-ASCII dialogue.
with open(os.path.join(path, 'StarTrekDialogue.json'), 'r', encoding='utf-8') as read_file:
    all_series = json.load(read_file)

### Wrangling & Set Up

In [4]:
# Define series variables and additional data sources for gender, etc. to be mapped, merged, and used to create series specific dataframes

# One entry per series code: the raw dialogue (stored in the JSON under the upper-case
# code) plus the paths of the per-series episode-data and gender csv files.
series_dict = {
    code: {
        'data': all_series[code.upper()],
        'data_file': os.path.join(path, '{}_data.csv'.format(code)),
        'gender_file': os.path.join(path, '{}_gender.csv'.format(code)),
    }
    for code in ('tos', 'tas', 'tng', 'ds9', 'voy', 'ent', 'dis', 'pic')
}

In [5]:
# Define main cast variables

# Character names (upper-case, as used in the 'Character' column) whose dialogue is kept
# for each series; every other speaker is filtered out when the dataframes are built.
main_cast = {
    'tos': [
        'KIRK', 'SPOCK', 'UHURA', 'CHEKOV', 'SULU',
        'CHAPEL', 'COMPUTER', 'MCCOY', 'SCOTT',
    ],
    'tas': [
        'KIRK', 'SPOCK', 'UHURA', 'CHEKOV', 'SULU',
        'CHAPEL', 'COMPUTER', 'MCCOY', 'SCOTT',
    ],
    'tng': [
        'PICARD', 'RIKER', 'WORF', 'DATA', 'TROI', 'CRUSHER', 'TASHA',
        'CHIEF', "O'BRIEN", 'GUINAN', 'LAFORGE', 'PULASKI', 'WESLEY',
    ],
    'ds9': [
        'SISKO', 'ODO', 'KIRA', 'JAKE', 'QUARK',
        'DAX', "O'BRIEN", 'BASHIR', 'WORF', 'EZRI',
    ],
    'voy': [
        'JANEWAY', 'CHAKOTAY', 'TUVOK', 'PARIS', 'TORRES', 'KIM',
        'EMH', 'NEELIX', 'KES', 'SEVEN', 'ICHEB', 'SESKA',
    ],
    'ent': [
        'ARCHER', 'DEGRA', 'HOSHI', 'PHLOX', 'REED',
        'SHRAN', "T'POL", 'TRAVIS', 'TUCKER',
    ],
    'dis': [
        'BURNHAM', 'SARU', 'VOQ', 'TYLER', 'STAMETS', 'TILLY', 'LORCA',
        'CULBER', 'PIKE', 'BOOK', 'NHAN', 'ADIRA', 'GRAY', 'GEORGIOU',
        'DETMER', 'OWOSEKUN', "L'RELL", 'SAREK', 'CORNWELL', 'AIRIAM', 'SPOCK',
    ],
    'pic': [
        'PICARD', 'AGNES', 'DAHJ', 'DATA', 'ELNOR', 'HUGH',
        'SOJI', 'RAFFI', 'RIOS', 'NAREK', 'SEVEN', 'RIZZO',
    ],
}

In [6]:
# Helper function to split Dialogue into a list of lists

# def transform_dialogue(dialogue):
#     return [item.split(', ') for item in dialogue]

In [7]:
# Helper function to join the list of Dialogue lines into a single string

# def join_dialogue(dialogue):
# #     return [item.concat(', ') for item in dialogue]
#     thingy = ''
#     for item in dialogue: 
#         thingy = thingy + ' ' + item
#     return thingy

def join_dialogue(dialogue):
    """Concatenate an iterable of dialogue strings into one string.

    Consecutive items are separated by a single space; an empty iterable
    yields an empty string.
    """
    separator = ' '
    return separator.join(dialogue)

In [8]:
# Helper function to filter out dialogue spoken by characters other than the main cast

def remove_secondary_cast_dialogue(df, main_cast_list):
    """Return only the rows of ``df`` whose 'Character' value is in ``main_cast_list``.

    The original row labels are kept; no re-indexing is performed.
    """
    is_main_cast = df['Character'].isin(main_cast_list)
    return df.loc[is_main_cast]

In [9]:
# Helper function to merge in data (gender, etc.) from additional csv files

def merge_additional_data(df, data_file, gender_file):
    """Attach episode-level columns and a 'Gender' column to ``df``.

    ``data_file`` is read as a ';'-delimited, latin1-encoded csv whose first column
    is used as the index; it is merged onto ``df`` by matching ``df['Episode']``
    against that index (pandas' default join type).

    ``gender_file`` is read as a header-less ';'-delimited csv whose first column is
    used as the index; the result is squeezed into a lookup that is mapped over
    ``df['Character']`` to fill 'Gender'.

    Both file names are joined onto the notebook-level ``path``.
    Returns the merged dataframe.
    """
    episode_info = pd.read_csv(
        os.path.join(path, data_file),
        index_col=0,
        delimiter=';',
        encoding='latin1',
    )
    merged = df.merge(episode_info, left_on='Episode', right_index=True)

    gender_table = pd.read_csv(
        os.path.join(path, gender_file),
        header=None,
        index_col=0,
        delimiter=';',
    )
    gender_by_character = gender_table.squeeze("columns").to_dict()

    merged['Gender'] = merged['Character'].map(gender_by_character)
    return merged

In [10]:
# Define the function that builds, stores, and exports a cleaned dataframe for one series

# Registry of the cleaned dataframes, keyed by series code; filled by create_series_df.
series_dataframes = {}

def create_series_df(series_name, series_data, main_cast_list, data_file, gender_file, output_dir=None):
    """Build, register and export the cleaned dialogue dataframe for one series.

    Parameters
    ----------
    series_name : str
        Series code; must be a key of ``series_dict``.
    series_data : dict
        Nested mapping that is expanded into 'Episode' / 'Character' / 'Dialogue'
        rows, where each 'Dialogue' value is a list of line strings.
    main_cast_list : list of str
        Characters whose rows are kept.
    data_file, gender_file : str
        csv files handed to ``merge_additional_data``.
    output_dir : str, optional
        Folder that receives '<series_name>_df_cleaned.csv'. Defaults to the
        'Altered Data' folder located next to the raw-data folder ``path``.
        The folder is created if it does not exist.

    Returns
    -------
    pandas.DataFrame or None
        The cleaned dataframe (also stored in ``series_dataframes``), or None when
        ``series_name`` is not recognized (a message is printed in that case).
    """
    if series_name not in series_dict:
        print('Series not recognized')
        return

    if output_dir is None:
        # Derive the export folder from the raw-data folder instead of repeating a
        # machine-specific absolute path.
        output_dir = os.path.join(os.path.dirname(os.path.normpath(path)), 'Altered Data')

    # Transform into dataframe: one row per (episode, character) pair
    series_df = pd.concat({k: pd.Series(v) for k, v in series_data.items()}).reset_index()
    series_df.columns = ['Episode', 'Character', 'Dialogue']

    # Remove carriage-return characters from every dialogue line
    series_df['Dialogue'] = series_df['Dialogue'].apply(lambda dialogues: [line.replace('\r', '') for line in dialogues])

    # Number of dialogue lines per row (length of the list, not of the text)
    series_df['Dialogue Length'] = series_df['Dialogue'].str.len()

    # Transform 'Dialogue' into a single string
    series_df['Flattened Dialogue'] = series_df['Dialogue'].apply(join_dialogue)

    # Drop dialogue not spoken by main cast
    series_df = remove_secondary_cast_dialogue(series_df, main_cast_list)

    # Merge columns regarding gender, etc. from additional data sources
    series_df = merge_additional_data(series_df, data_file, gender_file)

    # Keep only the analysis columns, in a fixed order
    series_df = series_df[['Episode', 'Season', 'Year', 'Title', 'Character', 'Gender', 'Dialogue', 'Flattened Dialogue', 'Dialogue Length']]

    series_dataframes[series_name] = series_df  # Store the DataFrame in the dictionary

    # Export the cleaned dataframe, creating the target folder on first use
    os.makedirs(output_dir, exist_ok=True)
    series_df.to_csv(os.path.join(output_dir, '{}_df_cleaned.csv'.format(series_name)), index=False)

    return series_df

In [11]:
# Run the builder once per configured series; results land in series_dataframes

for series_name, series_info in series_dict.items():
    create_series_df(
        series_name,
        series_data=series_info['data'],
        main_cast_list=main_cast[series_name],
        data_file=series_info['data_file'],
        gender_file=series_info['gender_file'],
    )

In [12]:
# Save each series into its own dataframe variable

(
    tos_dataframe,
    tas_dataframe,
    tng_dataframe,
    ds9_dataframe,
    ent_dataframe,
    dis_dataframe,
    pic_dataframe,
    voy_dataframe,
) = (
    series_dataframes[code]
    for code in ('tos', 'tas', 'tng', 'ds9', 'ent', 'dis', 'pic', 'voy')
)

In [13]:
# Preview the first two rows of the TOS dataframe as a quick sanity check
tos_dataframe.head(2)

Unnamed: 0,Episode,Season,Year,Title,Character,Gender,Dialogue,Flattened Dialogue,Dialogue Length
0,tos_000,tos_s1,1966,The Cage,SPOCK,m,"[Check the circuit., It can't be the screen th...",Check the circuit. It can't be the screen then...,27
21,tos_001,tos_s1,1966,The Man Trap,KIRK,m,"[Shall we pick some flowers, Doctor? When a ma...","Shall we pick some flowers, Doctor? When a man...",115


In [14]:
# Show the flattened dialogue of the first row. Positional access is used because the
# dataframe keeps its pre-filter row labels, so a row labelled 0 is not guaranteed to exist.
tos_dataframe['Flattened Dialogue'].iloc[0]

"Check the circuit. It can't be the screen then. Definitely something out there, Captain, headed this way. Their call letters check with a survey expedition. SS Columbia. It disappeared in that region approximately eighteen years ago. Records show the Talos group has never been explored. Solar system similar to Earth, eleven planets. Number four seems to be Class M, oxygen atmosphere. We aren't going to go, to be certain? Mister Spock here. We're intercepting a follow-up message, sir. There are crash survivors on Talos. Preliminary lab survey ready, sir. Yes, sir. Spock here. There is no survivors' encampment, Number One. This is all some sort of trap. We've lost the Captain. Do you read? The inhabitants of this planet must live deep underground, and probably manufacture food and other needs down there. Our tests indicate the planet surface, without considerably more vegetation or some animals, simply too barren to support life. Exactly. An illusion placed in our minds by this planet's