# The Office 

The `schrutepy` package provides the script for all seasons of The Office. 

[The Office - example analysis](https://nbviewer.jupyter.org/github/zgana/data-science-one-offs/blob/master/NLP-Analysis-of-The-Office.ipynb?flush_cache=true#Character-character-mentions)

In [1]:
import altair as alt
import pandas as pd

from schrutepy import schrutepy

# Switch Altair to its "fivethirtyeight" theme before any chart is built.
alt.themes.enable("fivethirtyeight")

ThemeRegistry.enable('fivethirtyeight')

In [2]:
# Names the analysis is restricted to: they are used both to select speakers
# and as the substrings searched for in each line of dialogue.
MAIN_CHARACTERS = [
    "Michael",
    "Jim",
    "Pam",
    "Dwight",
    "Phyllis",
    "Stanley",
    "Meredith",
    "Oscar",
    "Angela",
    "Kevin",
    "Ryan",
    "Toby",
    "Kelly",
]

def subset_data(full_dataset, MAIN_CHARACTERS):
    """Keep main-character lines that mention a main character, with flags.

    Parameters
    ----------
    full_dataset : pandas.DataFrame
        Must expose a ``character`` column (the speaker) and a ``text``
        column (the spoken line). It is not modified.
    MAIN_CHARACTERS : iterable of str
        Names used both to select speakers and as the substrings searched
        for in ``text``.

    Returns
    -------
    pandas.DataFrame
        The rows whose speaker is in ``MAIN_CHARACTERS``, whose ``text`` is
        not missing and whose ``text`` contains at least one of the names.
        The index is reset, a boolean ``keep`` column (always ``True``) is
        added, followed by one 0/1 integer column per name telling whether
        that name occurs in the line.

    Notes
    -----
    Matching is a plain, case-sensitive substring test, so a name also
    matches inside a longer word.
    """
    names = list(MAIN_CHARACTERS)

    spoken = full_dataset[
        full_dataset.character.isin(names) & full_dataset.text.notna()
    ]

    # One vectorised substring test per name instead of row-wise ``apply``;
    # ``regex=False`` keeps names literal even if they contain regex
    # metacharacters, and this also behaves when ``spoken`` is empty.
    flags = {}
    keep = pd.Series(False, index=spoken.index)
    for name in names:
        flags[name] = spoken.text.str.contains(
            name, regex=False, na=False
        ).astype(bool)
        keep = keep | flags[name]

    # Positional mask: avoids any index alignment when filtering below.
    mask = keep.to_numpy(dtype=bool)

    df = spoken.assign(keep=mask).loc[mask].reset_index(drop=True)

    for name in names:
        df[name] = flags[name].to_numpy(dtype=bool)[mask].astype(int)

    return df

In [3]:
# Load the script data shipped with schrutepy, then reduce it to the
# main-character lines that mention a main character (see subset_data).
full_dataset = schrutepy.load_schrute()
df = subset_data(full_dataset, MAIN_CHARACTERS)

In [4]:
# "Wide" layout: one row per (speaking character, season).
# Each main character gets its own column holding the sum of that name's
# 0/1 mention flags, i.e. the number of the speaker's lines in that season
# whose text contains the name.
mentions_by_season = df.pivot_table(
    index=["character", "season"], 
    values=MAIN_CHARACTERS, 
    aggfunc="sum").reset_index()

mentions_by_season.head()

Unnamed: 0,character,season,Angela,Dwight,Jim,Kelly,Kevin,Meredith,Michael,Oscar,Pam,Phyllis,Ryan,Stanley,Toby
0,Angela,1,0,0,0,0,2,1,0,0,0,1,0,0,0
1,Angela,2,1,3,1,0,1,0,3,0,1,2,0,0,1
2,Angela,3,1,12,0,3,4,3,6,2,3,5,0,0,0
3,Angela,4,1,6,2,1,2,2,2,1,2,3,0,0,2
4,Angela,5,1,6,0,1,9,1,7,4,0,2,0,0,1


In [5]:
# "Long" layout of the same counts (compare with the wide table above):
# one row per (speaking character, season, mentioned name). The name that
# used to be a column header goes into "talks_to" and its count into "value".
mentions_by_season_long = mentions_by_season.melt(id_vars=["character", "season"], 
                                                  var_name="talks_to")
mentions_by_season_long.head()

Unnamed: 0,character,season,talks_to,value
0,Angela,1,Angela,0
1,Angela,2,Angela,1
2,Angela,3,Angela,1
3,Angela,4,Angela,1
4,Angela,5,Angela,1


In [6]:
# Collapse the season dimension: total mentions across ALL seasons for each
# (speaking character, mentioned name) pair.
mentions_long = (mentions_by_season_long.groupby(["character", "talks_to"])
                 .agg({"value": "sum"})
                 .reset_index()
                )

mentions_long.head()

Unnamed: 0,character,talks_to,value
0,Angela,Angela,9
1,Angela,Dwight,73
2,Angela,Jim,11
3,Angela,Kelly,8
4,Angela,Kevin,38


## Visualize: Character Mentions

In [7]:
def make_bar_chart_by_season(df, character="Michael"):
    """Display a bar chart of who ``character`` mentions, split by season.

    Only the rows of ``df`` whose ``character`` column equals ``character``
    are plotted. Each ``season`` gets its own column panel; within a panel
    there is one bar per ``talks_to`` name (sorted ascending by name) whose
    height is the summed ``value``. The chart is shown with ``display`` and
    nothing is returned.
    """
    rows = df[df.character == character]

    x_axis = alt.X(
        "talks_to:N",
        title="character",
        sort=alt.EncodingSortField(field="talks_to", order="ascending"),
        axis=alt.Axis(labelAngle=-90),
    )
    y_axis = alt.Y("sum(value):Q", title="# times")

    bars = alt.Chart(rows).mark_bar().encode(
        x=x_axis,
        y=y_axis,
        column="season:O",
        color="talks_to:N",
    )
    titled = bars.properties(title=f"Who does {character} talk to the most?")
    chart = titled.configure(background="white")

    display(chart)

In [8]:
# Michael's mentions of each main character, one panel per season.
make_bar_chart_by_season(mentions_by_season_long, character="Michael")

In [9]:
# Filter to season 9 first so only that single panel is drawn for Dwight.
make_bar_chart_by_season(mentions_by_season_long[mentions_by_season_long.season==9], character="Dwight")

In [10]:
def make_bar_chart(df, character="Michael"):
    """Display a single bar chart of who ``character`` mentions overall.

    Only the rows of ``df`` whose ``character`` column equals ``character``
    are plotted: one bar per ``talks_to`` name, height equal to the summed
    ``value``, with the x axis sorted on the ``value`` field in descending
    order. The chart is shown with ``display`` and nothing is returned.
    """
    rows = df[df.character == character]

    x_axis = alt.X(
        "talks_to:N",
        title="character",
        sort=alt.EncodingSortField(field="value", order="descending"),
        axis=alt.Axis(labelAngle=-90),
    )
    y_axis = alt.Y("sum(value):Q", title="# times")
    # Empty title hides the legend heading.
    bar_color = alt.Color("talks_to:N", title="")

    bars = alt.Chart(rows).mark_bar().encode(x=x_axis, y=y_axis, color=bar_color)
    titled = bars.properties(title=f"Who does {character} talk to the most?")
    chart = titled.configure(background="white")

    display(chart)

In [11]:
# Jim's mentions of each main character, totalled over all seasons.
make_bar_chart(mentions_long, character="Jim")

## Try it!

Pick another character and see who they talk to the most each season by changing the `character` parameter in `make_bar_chart_by_season()`.

Pick another character and see who they talk to the most across all the seasons by changing the `character` parameter in `make_bar_chart()`.