In [6]:

import json
import re
import asyncio
import nest_asyncio
import pandas as pd
from langchain import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
nest_asyncio.apply()
import sys
sys.path.append('../')
from utils.llm import CustomLLM


# 1. Data Processing

In [7]:
#Function to read lines from a file
def read_file(file_name):
    with open(file_name, 'r') as file:
        return file.read().splitlines()

In [8]:
all_data = read_file('data/friends_dataset.jsonl')

In [9]:
print(len(all_data))

67373


In [10]:
#Intermediary dataframe to store the json elements
df_inter = pd.DataFrame(all_data)
df_inter.columns = ['json_element']

In [11]:
data_df = pd.json_normalize(df_inter['json_element'].apply(json.loads))

In [12]:
data_df.head()

Unnamed: 0,conversation_id,season,episode,scene,utterance_id,text,speaker
0,s01_e01_c01_u001,s01,e01,c01,s01_e01_c01_u001,There's nothing to tell! He's just some guy I ...,Monica Geller
1,s01_e01_c01_u001,s01,e01,c01,s01_e01_c01_u002,"C'mon, you're going out with the guy! There's ...",Joey Tribbiani
2,s01_e01_c01_u001,s01,e01,c01,s01_e01_c01_u003,"All right Joey, be nice. So does he have a hum...",Chandler Bing
3,s01_e01_c01_u001,s01,e01,c01,s01_e01_c01_u004,"Wait, does he eat chalk?",Phoebe Buffay
4,s01_e01_c01_u001,s01,e01,c01,s01_e01_c01_u005,,TRANSCRIPT_NOTE


In [13]:
#Drop rows that are not dialogues
data_df['text'] = data_df['text'].str.strip()
data_df = data_df[data_df['text'].str.len() > 0]
data_df.head()

Unnamed: 0,conversation_id,season,episode,scene,utterance_id,text,speaker
0,s01_e01_c01_u001,s01,e01,c01,s01_e01_c01_u001,There's nothing to tell! He's just some guy I ...,Monica Geller
1,s01_e01_c01_u001,s01,e01,c01,s01_e01_c01_u002,"C'mon, you're going out with the guy! There's ...",Joey Tribbiani
2,s01_e01_c01_u001,s01,e01,c01,s01_e01_c01_u003,"All right Joey, be nice. So does he have a hum...",Chandler Bing
3,s01_e01_c01_u001,s01,e01,c01,s01_e01_c01_u004,"Wait, does he eat chalk?",Phoebe Buffay
5,s01_e01_c01_u001,s01,e01,c01,s01_e01_c01_u006,"Just, 'cause, I don't want her to go through w...",Phoebe Buffay


In [14]:
#Grouping speaker with their respective dialogues
data_df['script'] = data_df['speaker'] + ': ' + data_df['text']
#Drop speaker and text columns
data_df.drop(columns=['speaker', 'text'], inplace=True)
data_df.head()

Unnamed: 0,conversation_id,season,episode,scene,utterance_id,script
0,s01_e01_c01_u001,s01,e01,c01,s01_e01_c01_u001,Monica Geller: There's nothing to tell! He's j...
1,s01_e01_c01_u001,s01,e01,c01,s01_e01_c01_u002,"Joey Tribbiani: C'mon, you're going out with t..."
2,s01_e01_c01_u001,s01,e01,c01,s01_e01_c01_u003,"Chandler Bing: All right Joey, be nice. So doe..."
3,s01_e01_c01_u001,s01,e01,c01,s01_e01_c01_u004,"Phoebe Buffay: Wait, does he eat chalk?"
5,s01_e01_c01_u001,s01,e01,c01,s01_e01_c01_u006,"Phoebe Buffay: Just, 'cause, I don't want her ..."


In [15]:
#Grouping script by conversation_id, season,episode, and scene as list of strings
data_df = data_df.groupby(['conversation_id', 'season', 'episode', 'scene'])['script'].apply(list).reset_index()
data_df.head()

Unnamed: 0,conversation_id,season,episode,scene,script
0,s01_e01_c01_u001,s01,e01,c01,[Monica Geller: There's nothing to tell! He's ...
1,s01_e01_c02_u001,s01,e01,c02,[Monica Geller: Now I'm guessing that he bough...
2,s01_e01_c03_u001,s01,e01,c03,[Phoebe Buffay: Love is sweet as summer shower...
3,s01_e01_c04_u001,s01,e01,c04,[Ross Geller: I'm supposed to attach a bracket...
4,s01_e01_c05_u001,s01,e01,c05,"[Monica Geller: Oh my God!, Paul the Wine Guy:..."


# 2. Data Analysis

In [16]:
data_df.head()

Unnamed: 0,conversation_id,season,episode,scene,script
0,s01_e01_c01_u001,s01,e01,c01,[Monica Geller: There's nothing to tell! He's ...
1,s01_e01_c02_u001,s01,e01,c02,[Monica Geller: Now I'm guessing that he bough...
2,s01_e01_c03_u001,s01,e01,c03,[Phoebe Buffay: Love is sweet as summer shower...
3,s01_e01_c04_u001,s01,e01,c04,[Ross Geller: I'm supposed to attach a bracket...
4,s01_e01_c05_u001,s01,e01,c05,"[Monica Geller: Oh my God!, Paul the Wine Guy:..."


In [17]:
#Check how many utterances are in each conversation
data_df['script'].apply(len).describe()

count    3099.000000
mean       19.783801
std        16.273398
min         1.000000
25%        10.000000
50%        17.000000
75%        25.000000
max       255.000000
Name: script, dtype: float64

In [18]:
#Check how many words are in each conversation
data_df['script'].apply(lambda x: [len(i.split()) for i in x]).apply(pd.Series).sum(axis=1).describe()

count    3099.000000
mean      239.605679
std       195.664833
min         2.000000
25%       123.000000
50%       202.000000
75%       306.000000
max      3820.000000
dtype: float64

# 3. Finetuning Data Dump

In [20]:
def dump_to_file(data,file_name='data/finetune_data.txt'):
    with open(file_name, 'w') as file:
        data = data+'\n'
        file.write(data)

In [21]:
for conversation_list in data_df['script']:
    conversation = ' '.join(conversation_list)
    dump_to_file(conversation)

# 4. Summary generation per conversation (for finetuning on summary instead of dialogues)


**NOTE: Opted not to use this approach due to unsatisfactory results.**

In [24]:
llm = CustomLLM()

In [25]:
print(llm)

[1mCustomLLM[0m
Params: {'api_url': 'http://localhost:8080/completion', 'max_new_tokens': 1024, 'top_p': 0.9, 'temperature': 0, 'repetition_penalty': 1.5}


In [26]:
async def get_conversation_summary(conversation, conversation_id):
    #conversation = '\n'.join(conversation)
    #text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
    
    conversation_chunks = [''.join(conversation[i:i+10]) for i in range(0, len(conversation), 10)]

    documents =  []

    for item in range(len(conversation_chunks)):
        page = Document(page_content=conversation_chunks[item],metadata = {'conversation_id': conversation_id})
        documents.append(page)


    map_prompt_template = """
    [INST]
    <<SYS>>You are expert in English Professor. You are objective, factual and you avoid any bias or interpretation.<</SYS>>
    Reword the given conversation in not more than five sentences, covering all the points.
    CONVERSATION:```{text}```
    [/INST]
    Sure, here is reworded conversation:\n
    """

    combine_prompt_template = """
    [INST]
    <<SYS>>You are an expert English Professor. You are objective, factual and you avoid any bias or interpretation.<</SYS>>
    Reword this conversation in less than 1000 words, covering all the most important points.
    CONVERSATION:```{text}```
    [/INST]
    Sure, here is reworded conversation:\n
    """

    map_prompt = PromptTemplate(template=map_prompt_template, input_variables=["text"])


    combine_prompt = PromptTemplate(
        template=combine_prompt_template, input_variables=["text"]
    )

    map_reduce_chain = load_summarize_chain(
    llm,
    chain_type="map_reduce",
    map_prompt=map_prompt,
    combine_prompt=combine_prompt
    )

    output = await map_reduce_chain.arun(documents)
    return output

In [27]:
async def process_all_conversations(all_conversations,all_conversations_ids):
    counter = 0
    for conversation,conversation_id in zip(all_conversations,all_conversations_ids):
        counter += 1
        summary_task = get_conversation_summary(conversation,conversation_id)
        tasks = [summary_task]
        results = await asyncio.gather(*tasks)

        for result in results:
            result = result.replace('\n', "")
            result = re.sub(' +', ' ', result)
            result = result.strip()
            json_data = {'conversation_id': conversation_id, 'summary': result}
            print(json_data)
            dump_to_file(json.dumps(json_data),file_name='data/summary_data.jsonl')
        
        if counter == 3:
           break

In [29]:
all_conversations = data_df['script'].tolist()
all_conversations_ids = data_df['conversation_id'].tolist()
asyncio.run(process_all_conversations(all_conversations,all_conversations_ids))

{'conversation_id': 's01_e01_c01_u001', 'summary': "Monica Geller and Joey Tribbiani were discussing her recent date with a coworker, while Chandler Bing joked about the man having a hump or hairpiece. Phoebe Buffay asked if he ate chalk, and shared her own experience with an ex-boyfriend named Carl who had a similar habit. Monica clarified that it was not a date and they were just going out to dinner as friends. Chandler added that it sounded like a date to him, and the group discussed their experiences with awkward dreams. Joey Tribbiani expressed his inability to recall a dream he had, which Chandler confirmed. Then, unexpectedly, the phone rang, causing everyone to look at him. Monica found it strange that they were all staring at him beforehand. Chandler joked about having a hump or hairpiece like the man Monica was dating. Ross Geller expressed his desire to be married again and Chandler wished for a million dollars. Monica greeted Rachel Green, another Lincoln High School alumnu