# The Office - Number of Lines

The `schrutepy` package provides the complete script of The Office, covering every season. This notebook uses it to count the number of lines spoken by each main character.

[The Office - example analysis](https://nbviewer.jupyter.org/github/zgana/data-science-one-offs/blob/master/NLP-Analysis-of-The-Office.ipynb?flush_cache=true#Character-character-mentions)

In [1]:
import altair as alt
import pandas as pd

from schrutepy import schrutepy

# Enable the "fivethirtyeight" Altair theme for the charts in this notebook.
alt.themes.enable("fivethirtyeight")

ThemeRegistry.enable('fivethirtyeight')

In [2]:
# Characters whose lines are kept for this analysis.
MAIN_CHARACTERS = [
    "Michael",
    "Jim",
    "Pam",
    "Dwight",
    "Phyllis",
    "Stanley",
    "Meredith",
    "Oscar",
    "Angela",
    "Kevin",
    "Ryan",
    "Toby",
    "Kelly",
]

def subset_data(full_dataset, MAIN_CHARACTERS):
    """Keep only the rows spoken by the given characters that have text.

    Args:
        full_dataset: DataFrame with at least ``character`` and ``text``
            columns.
        MAIN_CHARACTERS: Collection of character names to keep.

    Returns:
        The rows of ``full_dataset`` whose ``character`` is one of
        ``MAIN_CHARACTERS`` and whose ``text`` is not missing.
    """
    is_main_character = full_dataset["character"].isin(MAIN_CHARACTERS)
    has_text = full_dataset["text"].notna()
    return full_dataset[is_main_character & has_text]

In [3]:
# Load the full script, then keep only the main characters' rows that have text.
full_dataset = schrutepy.load_schrute()
df = subset_data(full_dataset, MAIN_CHARACTERS)

In [4]:
# Count the lines spoken per character, season and episode.
# `count` only tallies rows whose `text` is not missing.
lines_by_ep = (
    df.groupby(["character", "season", "episode", "episode_name"])[["text"]]
    .count()
    .reset_index()
)

lines_by_ep.head()

Unnamed: 0,character,season,episode,episode_name,text
0,Angela,1,1,Pilot,1
1,Angela,1,2,Diversity Day,4
2,Angela,1,3,Health Care,5
3,Angela,1,4,The Alliance,7
4,Angela,1,5,Basketball,3


In [5]:
# Roll the per-episode counts up to one total per character and season.
lines_by_season = (
    lines_by_ep.groupby(["character", "season"])[["text"]]
    .sum()
    .reset_index()
)

lines_by_season.head()

Unnamed: 0,character,season,text
0,Angela,1,23
1,Angela,2,133
2,Angela,3,197
3,Angela,4,166
4,Angela,5,206


In [6]:
# Roll the per-season totals up to one overall total per character.
lines = (
    lines_by_season.groupby(["character"])[["text"]]
    .sum()
    .reset_index()
)

lines.head()

Unnamed: 0,character,text
0,Angela,1551
1,Dwight,6801
2,Jim,6268
3,Kelly,835
4,Kevin,1551


## Visualize: Lines Spoken

In [7]:
def make_line_chart_by_season(df, character="Michael"):
    """Display and return a line chart of one character's lines per season.

    Args:
        df: DataFrame with ``character``, ``season`` and ``text`` columns;
            the ``text`` values are summed for each season.
        character: Name of the character whose rows are plotted.

    Returns:
        The Altair chart. It is also displayed as a side effect.
    """
    character_rows = df[df["character"] == character]

    line = alt.Chart(character_rows).mark_line()
    encoded = line.encode(
        x=alt.X("season:Q", title="season"),
        y=alt.Y("sum(text):Q", title="# lines"),
        color="character:N",
    )
    chart = encoded.properties(
        title=f"Number of Lines {character} Had",
        width=350,
        height=250,
    )

    display(chart)
    return chart

In [8]:
# Build (and display) one line chart per character, in this order.
angela, dwight, oscar = (
    make_line_chart_by_season(lines_by_ep, character=name)
    for name in ("Angela", "Dwight", "Oscar")
)

In [9]:
# Layer the charts: `+` overlays them, drawing one on top of another.
# The combined chart then gets its own title and a white background.
((angela + dwight + oscar)
 .properties(title="Number of Lines: Angela vs Dwight vs Oscar")
 .configure(background="white")
)

In [10]:
# Concatenate the three charts horizontally, on a white background.
(alt.hconcat(angela, dwight, oscar)
 # Exercise: uncomment the next line and re-run the cell. What changes?
 #.properties(title="Number of Lines: Angela vs Dwight vs Oscar")
 .configure(background="white")
)

In [11]:
# Concatenate the three charts vertically, on a white background.
(alt.vconcat(angela, dwight, oscar)
 # Exercise: comment out the next line and re-run the cell. What changes?
 .properties(title="Number of Lines: Angela vs Dwight vs Oscar")
 .configure(background="white")
)

In [12]:
# Bar chart of the total number of lines per character:
# one bar per character, colored by character.
(alt.Chart(lines)
 .mark_bar()
 .encode(
     x="character:N",
     y="sum(text)",
     color="character:N",
 ).properties(title="Number of Lines by Character")
 .configure(background="white")
)

In [13]:
# Bar chart of total lines per character, with the x axis and the color
# encoding both sorted by summed `text` in descending order.
(alt.Chart(lines)
 .mark_bar()
 .encode(
     x=alt.X("character:N", 
            # Order the characters by their summed `text`, largest first.
            sort=alt.EncodingSortField(field="text", op="sum", order="descending")
            ),
     y=alt.Y("sum(text)", title = "# lines"),
     color=alt.Color("character:N",
                     # Use the same ordering for the color encoding.
                     sort=alt.EncodingSortField(field="text", op="sum", order="descending")
                    )
 ).properties(title="Number of Lines by Character")
 .configure(background="white")
 # Set the chart title's font size to 14 and anchor it in the middle.
 .configure_title(fontSize=14, anchor="middle")
)

## Try it!

Pick other characters by changing the `character` parameter in `make_line_chart_by_season()`.

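Find ways to combine two or more charts, for example by layering them with `+` or concatenating them with `alt.hconcat()` / `alt.vconcat()`.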