# Preparing Transcript Data for LLMs

We want to prepare the transcript data for use by large language models (LLMs). This includes filtering the transcripts down to the lines we want to model and separating the data into training, validation, and test sets.

In [1]:
%run ../../src/config.py
%run ../../src/read_data.py
%run ../../src/build_data.py

In [2]:
# Paths are expressed relative to this notebook's directory.
relative_data_dir = f'../../{data_dir}'
relative_db_file = f'{relative_data_dir}/{db_file}'

# Read the transcript database first, then hand it to the campaign builder
# together with the columns to drop and the episodes to remove.
transcript_db = GetTranscriptDatabase(relative_db_file)
campaign_db = BuildCampaignDatabase(
    transcript_db,
    drop_cols = ['link', 'download_date'],
    remove_episodes = episodes_to_remove
)
campaign_db

Unnamed: 0,campaign_no,arc_no,episode_no,campaign,arc,episode,transcript_file,episode_index,episode_label
0,1,1,1,Campaign 1: Vox Machina,Arc 1: Kraghammer and Vasselheim,Arrival at Kraghammer,section001/subsection001/episode001.csv,1.0,1-1-01
1,1,1,2,Campaign 1: Vox Machina,Arc 1: Kraghammer and Vasselheim,Into the Greyspine Mines,section001/subsection001/episode002.csv,2.0,1-1-02
2,1,1,3,Campaign 1: Vox Machina,Arc 1: Kraghammer and Vasselheim,Strange Bedfellows,section001/subsection001/episode003.csv,3.0,1-1-03
3,1,1,4,Campaign 1: Vox Machina,Arc 1: Kraghammer and Vasselheim,Attack on the Duergar Warcamp,section001/subsection001/episode004.csv,4.0,1-1-04
4,1,1,5,Campaign 1: Vox Machina,Arc 1: Kraghammer and Vasselheim,The Trick about Falling,section001/subsection001/episode005.csv,5.0,1-1-05
...,...,...,...,...,...,...,...,...,...
349,3,4,19,Campaign Three: Bells Hells,Campaign Three Arc 4,Where The Red Fearne Glows,section003/subsection004/episode019.csv,349.0,3-4-19
350,3,4,20,Campaign Three: Bells Hells,Campaign Three Arc 4,Gathering of Needs,section003/subsection004/episode020.csv,350.0,3-4-20
351,3,4,21,Campaign Three: Bells Hells,Campaign Three Arc 4,Shadows New and Old,section003/subsection004/episode021.csv,351.0,3-4-21
352,3,4,22,Campaign Three: Bells Hells,Campaign Three Arc 4,Ancient Sins,section003/subsection004/episode022.csv,352.0,3-4-22


In [38]:
# Episode-level columns passed to CollectTranscripts; every column name is
# mapped to itself, so the mapping is generated from a single list.
episode_columns = [
    'campaign_no', 'arc_no', 'episode_no',
    'campaign', 'arc', 'episode',
    'episode_index', 'episode_label',
]
column_map = {column: column for column in episode_columns}

all_transcripts = CollectTranscripts(campaign_db, relative_data_dir, column_map)

In [39]:
# Keep only lines from the 'Part I' / 'Part II' sections ...
in_main_sections = all_transcripts['section'].isin(['Part I', 'Part II'])
section_lines = all_transcripts.loc[in_main_sections, :].reset_index(drop = True)

# ... and, of those, only lines whose speaker is a key of `cast`.
spoken_by_cast = section_lines['speaker'].isin(cast.keys())
transcripts = section_lines.loc[spoken_by_cast, :].reset_index(drop = True)
transcripts

Unnamed: 0,campaign_no,arc_no,episode_no,campaign,arc,episode,episode_index,episode_label,section_no,line_no,section,speaker,line,linelength,nwords
0,1,1,1,Campaign 1: Vox Machina,Arc 1: Kraghammer and Vasselheim,Arrival at Kraghammer,1.0,1-1-01,2,1,Part I,MATT,All right! Let's jump on in. Thank you. Last w...,1793,320
1,1,1,1,Campaign 1: Vox Machina,Arc 1: Kraghammer and Vasselheim,Arrival at Kraghammer,1.0,1-1-01,2,2,Part I,TRAVIS,Son of a bitch.,15,4
2,1,1,1,Campaign 1: Vox Machina,Arc 1: Kraghammer and Vasselheim,Arrival at Kraghammer,1.0,1-1-01,2,3,Part I,MATT,"Yep, the barbarian for his first and only time...",215,41
3,1,1,1,Campaign 1: Vox Machina,Arc 1: Kraghammer and Vasselheim,Arrival at Kraghammer,1.0,1-1-01,2,4,Part I,TRAVIS,Next time he dies.,18,4
4,1,1,1,Campaign 1: Vox Machina,Arc 1: Kraghammer and Vasselheim,Arrival at Kraghammer,1.0,1-1-01,2,5,Part I,MATT,Yeah. Essentially. Which managed to not turn i...,213,35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999120,3,4,23,Campaign Three: Bells Hells,Campaign Three Arc 4,The Nox Engine,353.0,3-4-23,4,1162,Part II,MATT,"It's next to that Ludinus stands, arms crossed...",2162,370
999121,3,4,23,Campaign Three: Bells Hells,Campaign Three Arc 4,The Nox Engine,353.0,3-4-23,4,1163,Part II,MATT,He reaches over to the cracked central dome an...,841,152
999122,3,4,23,Campaign Three: Bells Hells,Campaign Three Arc 4,The Nox Engine,353.0,3-4-23,4,1164,Part II,MATT,[cheering\n[laughter] \n,22,2
999123,3,4,23,Campaign Three: Bells Hells,Campaign Three Arc 4,The Nox Engine,353.0,3-4-23,4,1166,Part II,MATT,That's it. Thank you all so very much for join...,312,53


We're starting with nearly a million lines of dialogue, but let's be fairly strict. We only want episodes where the full cast is present (episodes in which 8 distinct cast members have at least one line), and we're only going to use lines that are at least 3 words long.

In [59]:
# Thresholds used by this cell.
FULL_CAST_SIZE = 8   # distinct speakers an episode needs to count as "full cast"
MIN_WORDS = 3        # minimum `nwords` for a line to be kept

# Count the distinct speakers in every episode. The result is bound to
# `per_episode_cast` *before* it is filtered: the original cell referenced this
# name without ever defining it, so it failed with a NameError on a fresh kernel.
per_episode_cast = (
    transcripts
    .groupby(['campaign_no', 'arc_no', 'episode_no', 'episode_index', 'episode_label'])
    .agg({'speaker': lambda speakers: len(set(speakers))})
    .reset_index()
)

# Labels of the episodes in which exactly FULL_CAST_SIZE distinct speakers appear.
full_cast_episodes = per_episode_cast.loc[
    per_episode_cast['speaker'] == FULL_CAST_SIZE, 'episode_label'
]

# Columns carried forward into the modelling data.
llm_columns = ['campaign_no', 'arc_no', 'episode_no', 'episode_index',
               'episode_label', 'section_no', 'line_no', 'speaker', 'line', 'nwords']

# A line is kept when it belongs to a full-cast episode AND is long enough.
keep_line = (
    transcripts['episode_label'].isin(list(full_cast_episodes))
    & (transcripts['nwords'] >= MIN_WORDS)
)
llm_transcripts = transcripts.loc[keep_line, llm_columns].reset_index(drop = True)
llm_transcripts

Unnamed: 0,campaign_no,arc_no,episode_no,episode_index,episode_label,section_no,line_no,speaker,line,nwords
0,1,1,4,4.0,1-1-04,2,1,MATT,"All right guys, so, getting to the game at han...",333
1,1,1,4,4.0,1-1-04,2,3,MATT,"Upon returning, the party had a plan to lead t...",67
2,1,1,4,4.0,1-1-04,2,4,ASHLEY,"Wait, you still have the magic carpet, right?",8
3,1,1,4,4.0,1-1-04,2,5,MATT,"Yeah, it got recovered. Barely.",5
4,1,1,4,4.0,1-1-04,2,6,MARISHA,"We got it back, it's okay.",6
...,...,...,...,...,...,...,...,...,...,...
464271,3,4,23,353.0,3-4-23,4,1161,MATT,"""Very well."" He adjusts his cape and his mantl...",160
464272,3,4,23,353.0,3-4-23,4,1162,MATT,"It's next to that Ludinus stands, arms crossed...",370
464273,3,4,23,353.0,3-4-23,4,1163,MATT,He reaches over to the cracked central dome an...,152
464274,3,4,23,353.0,3-4-23,4,1166,MATT,That's it. Thank you all so very much for join...,53


Here we see that we have come down to about 460,000 lines of dialogue to use for our models. Let's use an 80/10/10 train/test/validation split, stratified by episode (`episode_label`) so that each episode contributes proportionally to every set.

In [60]:
from sklearn.model_selection import train_test_split

# Step 1: set aside 20% of the lines as a holdout, stratified on episode_label.
transcript_train, transcript_holdout = train_test_split(
    llm_transcripts,
    stratify = llm_transcripts['episode_label'],
    test_size = 0.2,
    random_state = 1618,
)

# Step 2: cut the holdout in half (again stratified on episode_label) to get
# the test and validation sets.
transcript_test, transcript_validation = train_test_split(
    transcript_holdout,
    stratify = transcript_holdout['episode_label'],
    test_size = 0.5,
    random_state = 2718,
)

# Report the size of each split.
print(f'The training data has {len(transcript_train)} rows.')
print(f'The test data has {len(transcript_test)} rows.')
print(f'The validation data has {len(transcript_validation)} rows.')

The training data has 371420 rows.
The test data has 46428 rows.
The validation data has 46428 rows.


In [61]:
# Write each split to its own CSV file, leaving the DataFrame index out.
split_outputs = (
    (transcript_train, '../../data/prepared_data/transcript_train.csv'),
    (transcript_test, '../../data/prepared_data/transcript_test.csv'),
    (transcript_validation, '../../data/prepared_data/transcript_validation.csv'),
)
for split_frame, csv_path in split_outputs:
    split_frame.to_csv(csv_path, index = False)

In [62]:
# Display the training split as a sanity check; its row index is the shuffled
# position each line had in `llm_transcripts` before the split.
transcript_train

Unnamed: 0,campaign_no,arc_no,episode_no,episode_index,episode_label,section_no,line_no,speaker,line,nwords
198928,2,5,18,223.0,2-5-18,2,1054,LIAM,"Fjord, do you want to carry these, or do you w...",16
311934,3,1,19,274.0,3-1-19,2,1943,LAURA,"Oh, it's my glasses. Hold on. (laughter)",7
357196,3,2,17,295.0,3-2-17,4,710,MATT,"I'll allow it for the time being, yeah.",8
372744,3,2,25,303.0,3-2-25,4,468,MARISHA,You're more just visiting another landscape.,6
240360,2,6,15,241.0,2-6-15,4,13,TRAVIS,As a point of clarification--,5
...,...,...,...,...,...,...,...,...,...,...
400655,3,3,19,325.0,3-3-19,2,1258,MATT,"All right, finishing FCG's go.",5
290114,3,1,11,266.0,3-1-11,2,415,MATT,22 points of lightning damage.,5
271402,2,6,28,254.0,2-6-28,4,395,LIAM,But I'm still pretty low.,5
404106,3,3,20,326.0,3-3-20,2,1058,MATT,"All right, who's keeping watch?",5
