In [35]:
import os
import pandas as pd
import numpy as np
import openai
import tiktoken

# Configuration: the environment variable that holds the API key, and the chat model to use.
API_KEY_ENV = "OPENAI_API_KEY"
MODEL = 'gpt-3.5-turbo'

# Give the key to the openai client (None when the variable is unset) and
# look up the tiktoken encoding registered for MODEL.
openai.api_key = os.environ.get(API_KEY_ENV)
encoding = tiktoken.encoding_for_model(MODEL)

file = os.path.join("data", "wiki_movie_plots_deduped.csv")

# Explicit dtypes: the release year is an integer, every other listed column is text.
types = {'Release Year': 'int'}
types.update({
    column: 'string'
    for column in ('Title', 'Origin/Ethnicity', 'Director', 'Cast', 'Genre', 'Plot')
})

# Load the CSV and discard the 'Wiki Page' column.
df = pd.read_csv(file, dtype=types).drop(columns='Wiki Page')
df.shape

(34886, 7)

In [36]:
def _token_length(plot):
    """Return how many tokens `encoding` produces for `plot`."""
    return len(encoding.encode(plot))

# Length of each plot in characters
df['plot_characters'] = df['Plot'].map(len)
# Length of each plot in tokens of the model's encoding
df['plot_token_count'] = df['Plot'].map(_token_length)
df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Plot,plot_characters,plot_token_count
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,"A bartender is working at a saloon, serving dr...",500,105
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,"The moon, painted with a smiling face hangs ov...",437,101
2,1901,The Martyred Presidents,American,Unknown,,unknown,"The film, just over a minute long, is composed...",436,100
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,Lasting just 61 seconds and consisting of two ...,890,193
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,The earliest known adaptation of the classic f...,732,165


In [38]:
# Largest plot length, in tokens, that is kept for summarisation.
# NOTE(review): presumably chosen to stay within the chat model's context
# window; confirm, bearing in mind the prompt text and the reply also use tokens.
MAX_PLOT_TOKENS = 4000

# Filter and sort in one non-mutating chain. Calling sort_values(inplace=True)
# on the frame returned by query() mutates a derived frame, which can raise
# SettingWithCopyWarning; returning a new sorted frame avoids that and gives
# the same rows in the same order (longest plots first, ties by latest year).
df = (
    df.query('plot_token_count <= @MAX_PLOT_TOKENS')
      .sort_values(by=['plot_token_count', 'Release Year'], ascending=False)
)
df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Plot,plot_characters,plot_token_count
23223,1987,Sworn Brothers,Hong Kong,David Lai,"Andy Lau, Cheung Kwok Keung",crime,"When Lam Ting-yat was little, his father died ...",16636,3675
1897,1935,Grand Exit,American,Erle C. Kenton,"Edmund Lowe, Ann Sothern",mystery,The front page of a brochure depicts a globe w...,15709,3639
3009,1941,Broadway Limited,American,Gordon Douglas,"Victor McLaglen, Patsy Kelly, ZaSu Pitts",comedy,"Following the screening of her latest film ""Th...",16517,3595
3592,1943,Isle of Forgotten Sins,American,Edgar G. Ulmer,"Gale Sondergaard, John Carradine",adventure,Somewhere on one of the English-speaking South...,15046,3546
33726,2013,Detective Conan: Private Eye in the Distant Sea,Japanese,Kobun Shizuno,Minami Takayama,unknown,"At Maizuru Bay, on the dawn of April 20, a coa...",15763,3534


In [39]:
# Take the plot text of the second row in the current (sorted) order.
plot1 = df['Plot'].iloc[1]
plot1

'The front page of a brochure depicts a globe with ribbons stating "INTEROCEANIC FIRE INSURANCE CO." and, underneath, "estd. 1872" and, still below, "OFFICES IN ALL PRINCIPAL CITIES". Turning to page 6., one sees a photograph of a building described above as "EXCELSIOR PAPER COMPANY" and below as "Insured by Interoceanic Fire Insurance Co." As a blazing inferno collapses the building in the photo, a policeman moves away the onlookers and one of them (Edmund Lowe) takes a few steps aside to find a woman (Ann Sothern), standing on a pile of crates, observing the fire. As he looks up at her, she says, "I wonder how it started". He responds by providing a bantering discourse on "two hundred and two ways of starting a fire", illustrating it by striking a match on his teeth. He then offers her a cigarette, then invites her for a beer ("nice cold beer is very nice after a hot fire"), then "some food" and, finally, "a little stroll through the park". She refuses all offers ("you seem to think 

In [42]:
# Ask the chat model for a short summary of the selected plot.
summary_prompt = "summarize the following movie plot in four sentences or less: {}"
chat_messages = [
    {'role': 'user', 'content': summary_prompt.format(plot1)},
]
result = openai.ChatCompletion.create(model=MODEL, messages=chat_messages)

In [45]:
# Show the model's summary followed by the original Wikipedia plot.
llm_summary = result.choices[0].message.content
report_template = "LLM Summary\n============\n{}\n\nWikipedia plot\n============\n{}"
print(report_template.format(llm_summary, plot1))

LLM Summary
The movie plot revolves around a series of fires being set by an arsonist, targeting a fire insurance company. Tom Fletcher, a skilled insurance investigator, is brought in to solve the case. As he investigates the fires, he is joined by a woman named Adrienne, who has her own connection to the case. In the end, they discover the true identity of the arsonist and bring them to justice.

Wikipedia plot
The front page of a brochure depicts a globe with ribbons stating "INTEROCEANIC FIRE INSURANCE CO." and, underneath, "estd. 1872" and, still below, "OFFICES IN ALL PRINCIPAL CITIES". Turning to page 6., one sees a photograph of a building described above as "EXCELSIOR PAPER COMPANY" and below as "Insured by Interoceanic Fire Insurance Co." As a blazing inferno collapses the building in the photo, a policeman moves away the onlookers and one of them (Edmund Lowe) takes a few steps aside to find a woman (Ann Sothern), standing on a pile of crates, observing the fire. As he looks