# First approach & Data exploration

In [247]:
!pip3 install pandas




[notice] A new release of pip available: 22.2.2 -> 22.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [248]:
import pandas as pd
import re

In [249]:
# Input files, given relative to the notebook's working directory.
# csv_file is read with pd.read_csv and json_file with pd.read_json below.
csv_file = "./the_office_lines_scripts.csv"
json_file = "./stopwords.json"

In [250]:
# Load the script lines; the file is decoded as UTF-8.
scripts_df = pd.read_csv(csv_file, encoding="utf_8")
# Bare last expression so the notebook renders the frame.
scripts_df

Unnamed: 0,id,season,episode,scene,line_text,speaker,deleted
0,1,1,1,1,All right Jim. Your quarterlies look very good...,Michael,False
1,2,1,1,1,"Oh, I told you. I couldn't close it. So...",Jim,False
2,3,1,1,1,So you've come to the master for guidance? Is ...,Michael,False
3,4,1,1,1,"Actually, you called me in here, but yeah.",Jim,False
4,5,1,1,1,"All right. Well, let me show you how it's done.",Michael,False
...,...,...,...,...,...,...,...
59904,59905,9,23,112,It all seems so very arbitrary. I applied for ...,Creed,False
59905,59906,9,23,113,I just feel lucky that I got a chance to share...,Meredith,False
59906,59907,9,23,114,I���m happy that this was all filmed so I can ...,Phyllis,False
59907,59908,9,23,115,I sold paper at this company for 12 years. My ...,Jim,False


In [251]:
# Summary statistics of the numeric columns (id, season, episode, scene).
scripts_df.describe()

Unnamed: 0,id,season,episode,scene
count,59909.0,59909.0,59909.0,59909.0
mean,29955.0,5.348178,11.558597,27.143852
std,17294.382975,2.389427,6.986208,17.860616
min,1.0,1.0,1.0,1.0
25%,14978.0,3.0,5.0,14.0
50%,29955.0,5.0,11.0,25.0
75%,44932.0,7.0,18.0,37.0
max,59909.0,9.0,26.0,116.0


In [252]:
# Load the stop words; the resulting frame has a single column labelled 0.
stop_words_df = pd.read_json(json_file)
# Bare last expression so the notebook renders the frame.
stop_words_df

Unnamed: 0,0
0,a
1,about
2,above
3,after
4,again
...,...
779,thorough
780,thoroughly
781,three
782,well


In [253]:
# count vs. unique shows whether the stop-word list contains duplicates.
stop_words_df.describe()

Unnamed: 0,0
count,784
unique,781
top,keeps
freq,2


# Main data

For easy access to the information, we can transform scripts_df into three main variables. "characters_data": a dictionary capable of telling all the seasons/episodes in which a character was present; it also has a place for season metadata to solve the tasks of this option. "said_jokes": a list of the possible scripts_df positions with a "That's what she said" joke in the line_text column. "season_episodes": a dictionary with the seasons as keys and a list of episodes as data.


characters_data format = { 
    character_name(string): { 
        season_number(integer):{ 
            "episodes_in_season": [int, int,...], 
            "metadata_of_the_season": { 
                to be determined 
            } 
        } 
    } 
} 

In [254]:
def add_character(name, data):
    """Make sure ``data`` has an entry for the character ``name``.

    A character seen for the first time gets an empty dict; an existing
    entry is left untouched.
    """
    data.setdefault(name, {})
        
def add_season(season, name, data, seasons_episodes):
    """Register ``season`` for character ``name`` and in ``seasons_episodes``.

    The first time a character is seen in a season, an empty statistics
    record is created for it. Independently, the season gets an empty dict
    in ``seasons_episodes`` if it is not there yet. Existing records are
    never overwritten.
    """
    character_seasons = data[name]
    if season not in character_seasons:
        fresh_stats = {
            "episodes": [],
            "words": {},
            "n_words": 0,
            "n_lines": 0,
            "lines_per_ep": {},
        }
        character_seasons[season] = fresh_stats
    seasons_episodes.setdefault(season, {})
        
def add_episode(episode, season, name, data, seasons_episodes):
    """Record one appearance of ``name`` in ``episode`` of ``season``.

    The episode is added to the character's episode list (with a zeroed
    per-episode line counter) the first time it is seen. On every call the
    counter ``seasons_episodes[season][episode]`` is increased by one.
    """
    season_stats = data[name][season]
    if episode not in season_stats["episodes"]:
        season_stats["episodes"].append(episode)
        season_stats["lines_per_ep"][episode] = 0

    episode_counts = seasons_episodes[season]
    episode_counts[episode] = episode_counts.get(episode, 0) + 1

In [255]:
def generate_main_data(scripts, stop):
    """Build the lookup structures used by the rest of the notebook.

    Parameters
    ----------
    scripts : object with ``itertuples()``
        Each tuple is read by position: element 2 is passed on as the season,
        element 3 as the episode and element 6 as the character name.
    stop : object with ``itertuples()``
        Element 1 of each tuple is taken as a stop word.

    Returns
    -------
    tuple
        ``(characters_data, seasons_episodes, stop_words)`` where
        ``stop_words`` is a list of the stop words with every character
        outside ``A-Za-z0-9`` removed.
    """
    characters_data, seasons_episodes = {}, {}

    for row in scripts.itertuples():
        season, episode, speaker = row[2], row[3], row[6]
        add_character(speaker, characters_data)
        add_season(season, speaker, characters_data, seasons_episodes)
        add_episode(episode, season, speaker, characters_data, seasons_episodes)

    # Stop words get the same clean-up that is applied to the spoken words.
    stop_words = [re.sub('[^A-Za-z0-9]+', '', row[1]) for row in stop.itertuples()]

    return characters_data, seasons_episodes, stop_words

# Tasks

## Create metadata

In [256]:
def insert_word(word, stop_words, season_data):
    """Count one occurrence of ``word`` in ``season_data`` unless it is ignored.

    Parameters
    ----------
    word : str
        Token to count, already stripped of punctuation by the caller.
    stop_words : container of str
        Words that must not be counted.
    season_data : dict
        Record with a ``"words"`` dict (word -> occurrences) and an
        ``"n_words"`` counter; both are updated in place.

    Ignored tokens
    --------------
    * The empty string: a token made only of punctuation becomes ``""`` after
      clean-up and is not a word.
    * Stop words, compared both as given and lower-cased, so that capitalised
      occurrences (e.g. at the start of a sentence) are filtered by a
      lower-case stop list.
    """
    if not word:
        return
    if word in stop_words or word.lower() in stop_words:
        return

    words = season_data["words"]
    words[word] = words.get(word, 0) + 1
    season_data["n_words"] += 1

def count_line(season_data, episode):
    """Add one spoken line to the season total and to ``episode``'s counter.

    ``episode`` must already have an entry in ``season_data["lines_per_ep"]``;
    otherwise a ``KeyError`` is raised after the season total was increased.
    """
    season_data["n_lines"] = season_data["n_lines"] + 1
    per_episode = season_data["lines_per_ep"]
    per_episode[episode] = per_episode[episode] + 1

# Rebuild the lookup structures in this cell so that re-running it starts
# from empty counters instead of adding to the previous run.
characters_data, season_episodes, stop_words = generate_main_data(scripts_df, stop_words_df)

for row in scripts_df.itertuples():
    # Positional access: 2 -> season, 3 -> episode, 5 -> line_text, 6 -> speaker.
    season, episode, text, speaker = row[2], row[3], row[5], row[6]
    season_data = characters_data[speaker][season]
    count_line(season_data, episode)
    tokens = text.strip().split()
    for token in tokens:
        cleaned = re.sub('[^A-Za-z0-9]+', '', token)
        insert_word(cleaned, stop_words, season_data)

In [257]:
# Index of every row whose speaker field contains an opening square bracket.
bracketed_rows = [row[0] for row in scripts_df.itertuples() if "[" in row[6]]
for row_index in bracketed_rows:
    print(row_index)

20936
23024
23026
23762
23765
23856
23858
23860
23862
23940
40236
40244
44975
44976
53563
53564
53572


## Questions

### How many characters? What are their names?

In [258]:
character_names = list(characters_data.keys())
n_characters = len(character_names)

# One numbered line per character, preceded by the total.
with open('names.txt', 'w') as names_output:
    names_output.write(f"{n_characters} characters in this file.\n\n")
    for count, name in enumerate(character_names, start=1):
        names_output.write(f"{count} -> {name}\n")

print(f"there are {n_characters} characters in the series, their names are in \"names.txt\" file")

there are 793 characters in the series, their names are in "names.txt" file


### For each character, find out who has the most lines across all episodes

In [259]:
# Total number of lines per character, summed over all of their seasons.
lines_by_character = {
    name: sum(season_stats["n_lines"] for season_stats in seasons.values())
    for name, seasons in characters_data.items()
}

# Keep the first character that reaches the highest total.
most_lines = ["", 0]
for name, total_lines in lines_by_character.items():
    if total_lines > most_lines[1]:
        most_lines = [name, total_lines]

print(f"The charecter with the most lines across all the episodes is {most_lines[0]}, with {most_lines[1]} lines.")

The charecter with the most lines across all the episodes is Michael, with 12140 lines.


### What is the average of words per line for each character?

In [260]:
with open('avg_words_per_line.csv', 'w') as avg_words_output:
    avg_words_output.write("Name, avg_words_per_line\n")
    for name, seasons in characters_data.items():
        # n_words holds the words accumulated by insert_word, i.e. stop words
        # are not part of this count.
        words = sum(stats["n_words"] for stats in seasons.values())
        lines = sum(stats["n_lines"] for stats in seasons.values())
        # True division: floor division would report 7.9 as 7 and anything
        # below one word per line as 0.
        average = round(words / lines, 2) if lines else 0.0
        avg_words_output.write(f"{name},{average}\n")

print("The answer is in avg_words_per_line.csv")

The answer is in avg_words_per_line.csv


### What is the most common word per character

In [261]:
def most_common_word(words):
    """Return ``[word, count]`` for the entry of ``words`` with the highest count.

    On ties the entry that comes first in iteration order wins. If ``words``
    is empty, or no count is greater than zero, ``["", 0]`` is returned.
    """
    best_word, best_count = "", 0
    for candidate, occurrences in words.items():
        if occurrences > best_count:
            best_word, best_count = candidate, occurrences
    return [best_word, best_count]

with open('common_words.csv', 'w') as common_words_output:
    common_words_output.write("name,most_common_word\n")
    for name, seasons in characters_data.items():
        # Merge the per-season word counts of this character.
        words_dict = {}
        for season_stats in seasons.values():
            for word, occurrences in season_stats["words"].items():
                words_dict[word] = words_dict.get(word, 0) + occurrences
        # Order by descending count before picking the top entry.
        ranked_items = sorted(words_dict.items(), key=lambda item: item[1], reverse=True)
        words_dict = dict(ranked_items)
        common = most_common_word(words_dict)
        common_words_output.write(f"{name},{common[0]}\n")


print("The answer is in common_words.csv")

The answer is in common_words.csv


### Number of episodes where the character does not have a line, for each character

In [262]:
# Every episode of the series counts, including those of seasons in which a
# character never speaks; restricting the count to the character's own
# seasons would miss all of those episodes.
total_episodes = sum(len(episodes) for episodes in season_episodes.values())

with open('episodes_without_lines.csv', 'w') as no_lines_output:
    no_lines_output.write("name,number_of_episodes\n")
    for name, seasons in characters_data.items():
        episodes_with_lines = sum(len(stats["episodes"]) for stats in seasons.values())
        no_lines = total_episodes - episodes_with_lines
        no_lines_output.write(f"{name},{no_lines}\n")

# The message names the file that was actually written.
print("The answer is in episodes_without_lines.csv")

The answer is in episodes_without_line.csv


### Number of times "That's what she said" joke comes up & five examples

In [263]:
def get_posible_jokes(joke, scripts_data):
    """Return the index of every row whose text contains ``joke``.

    Parameters
    ----------
    joke : str
        Phrase to look for; the match is case-insensitive.
    scripts_data : object with ``itertuples()``
        Element 0 of each tuple is the row index and element 5 the text that
        is searched.

    Returns
    -------
    list
        Row indices, in iteration order, of the rows containing the phrase.
        Rows whose text is not a string (e.g. a missing value) are skipped.
    """
    # Lower-case the phrase once instead of on every row.
    needle = joke.lower()
    posible = []
    for row in scripts_data.itertuples():
        text = row[5]
        if isinstance(text, str) and needle in text.lower():
            posible.append(row[0])
    return posible

posible_jokes = get_posible_jokes("That's what she said", scripts_df)
print(f"There are {len(posible_jokes)} posible instances of \"That´s what she said\" jokes.")

# The section asks for five examples. As before, the first two matches are
# skipped and each example shows the line before and the line after the match.
FIRST_EXAMPLE = 2
N_EXAMPLES = 5
CONTEXT_LINES = 1

# Slicing never raises, so fewer matches simply yield fewer examples.
selected_jokes = posible_jokes[FIRST_EXAMPLE:FIRST_EXAMPLE + N_EXAMPLES]

# Explicit UTF-8 so lines with non-ASCII characters can be written on any platform.
with open('jokes.txt', 'w', encoding='utf-8') as jokes_output:
    for count, joke_idx in enumerate(selected_jokes, start=1):
        jokes_output.write(f"Example {count}:\n")
        # Clamp the window: index -1 would wrap around to the last row and an
        # index past the end would raise.
        first = max(joke_idx - CONTEXT_LINES, 0)
        last = min(joke_idx + CONTEXT_LINES, len(scripts_df) - 1)
        for idx in range(first, last + 1):
            jokes_output.write(scripts_df["speaker"].iloc[idx] + ": " + scripts_df["line_text"].iloc[idx] + "\n")
        jokes_output.write("\n")

print("The examples are in jokes.txt")

There are 37 posible instances of "That´s what she said" jokes.
The examples are in jokes.txt


### The average percent of lines each character contributed to each episode per season

In [264]:
# One row per (character, season, episode): the share of the episode's lines
# spoken by that character, as a percentage.
with open("lines_per_episode.csv", "w") as lines_per_episode:
    lines_per_episode.write("name, season, episode, percentage_spoken\n")
    for name, seasons in characters_data.items():
        for season, season_stats in seasons.items():
            for ep in season_stats["episodes"]:
                spoken = season_stats["lines_per_ep"][ep]
                total = season_episodes[season][ep]
                percentage = spoken * 100 / total
                fields = [name, str(season), str(ep), str(percentage)]
                lines_per_episode.write(",".join(fields) + "\n")

print("The answer is in lines_per_episode.csv")

The answer is in lines_per_episode.csv


## 3 questions invented by me

## Additional questions

### What are the most critical challenges for Adara, related to data science?

I believe.

### Why did you choose this assignment instead of option 1?

I chose this assignment because it's been a long time since I was able to do "data analysis", so I wanted to see how I was doing and what I remembered from classes (data analysis was one of my two minors, but because of curriculum problems I had to drop it). Also, when people apply for a computer science job there is usually a Frontend/Backend assignment to solve, and since I have been applying to different jobs, I wanted to do something different to get out of the routine and to recall past knowledge (especially as I am still in university and work on Frontend or Backend in almost all my courses).