In [1]:
from collections import Counter
import pandas as pd

## Note:

First, pre-process the raw data with the **preprocess.py** script (see root directory). This notebook makes use of the resulting pkl file.

## Load data

In [2]:
df = pd.read_pickle('../data/all_data.pkl')
df.shape

(2427, 4)

In [3]:
df.head()

Unnamed: 0,movieId,plot,tag,tokens
0,1,A little boy named Andy loves to be in his roo...,animation,"[little, boy, named, andy, loves, room, playin..."
1,2,When two kids find and play a magical board ga...,fantasy,"[two, kids, find, play, magical, board, game, ..."
2,3,Things don't seem to change much in Wabasha Co...,comedy,"[things, seem, change, much, wabasha, county, ..."
3,6,Hunters and their prey--Neil and his professio...,action,"[hunters, prey, neil, professional, criminal, ..."
4,7,An ugly duckling having undergone a remarkable...,romance,"[ugly, duckling, undergone, remarkable, change..."


## Class distribution

In [4]:
df.tag.value_counts()

comedy       780
action       437
romance      380
sci-fi       352
animation    283
fantasy      195
Name: tag, dtype: int64

## Text lenghts

In [5]:
df['n_tokens'] = df.tokens.apply(len)
df.groupby('tag').n_tokens.mean().round(1)

tag
action       38.5
animation    37.0
comedy       37.1
fantasy      42.1
romance      42.7
sci-fi       39.0
Name: n_tokens, dtype: float64

## Most frequent words

In [6]:
def get_wordcounts(df, tag, topn=10):
    counts = Counter()
    lists_of_tokens = df.query(f"tag == @tag").tokens
    for tokenlist in lists_of_tokens:
        for token in tokenlist:
            counts[token] += 1
    cols = pd.MultiIndex.from_product([[tag], ['token', 'count']])
    return pd.DataFrame(counts.most_common(topn), columns=cols)

In [7]:
def make_topn_table(df, topn=10):
    return pd.concat(
        [get_wordcounts(df, tag, topn=topn) for tag in df.tag.unique()], axis=1
    )
    
table = make_topn_table(df)
table

Unnamed: 0_level_0,animation,animation,fantasy,fantasy,comedy,comedy,action,action,romance,romance,sci-fi,sci-fi
Unnamed: 0_level_1,token,count,token,count,token,count,token,count,token,count,token,count
0,world,68,world,53,life,177,one,96,love,160,earth,112
1,young,63,one,41,one,160,man,79,life,138,world,86
2,new,51,must,41,new,153,new,73,one,94,one,84
3,named,45,young,40,man,125,life,68,young,87,alien,77
4,one,45,life,40,two,121,must,66,new,74,planet,76
5,must,42,evil,38,get,119,find,63,woman,71,new,71
6,get,39,king,34,find,102,agent,58,man,69,future,68
7,two,39,father,34,family,94,world,54,two,67,find,66
8,life,37,family,31,father,94,young,53,time,56,human,52
9,boy,36,find,30,young,88,team,50,finds,55,life,49


In [8]:
# convert table to latex format

latex = table.to_latex(index=False, multicolumn=True)
caption = "Top 10 tokens by genre"
label = "tab:top_10_tokens"

super_latex = (
    "\\begin{table}\n"
    f"\caption{{{caption}}}\n"
    f"\label{{tab:{label}}}\n"
    "\makebox[\\textwidth][c]{\n"
    f"{latex}"
    "}\n"
    "\end{table}\n"
)

print(super_latex)

\begin{table}
\caption{Top 10 tokens by genre}
\label{tab:tab:top_10_tokens}
\makebox[\textwidth][c]{
\begin{tabular}{lrlrlrlrlrlr}
\toprule
animation & \multicolumn{2}{l}{fantasy} & \multicolumn{2}{l}{comedy} & \multicolumn{2}{l}{action} & \multicolumn{2}{l}{romance} & \multicolumn{2}{l}{sci-fi} \\
    token & count &   token & count &   token & count &  token & count &   token & count &   token & count \\
\midrule
    world &    68 &   world &    53 &    life &   177 &    one &    96 &    love &   160 &   earth &   112 \\
    young &    63 &     one &    41 &     one &   160 &    man &    79 &    life &   138 &   world &    86 \\
      new &    51 &    must &    41 &     new &   153 &    new &    73 &     one &    94 &     one &    84 \\
    named &    45 &   young &    40 &     man &   125 &   life &    68 &   young &    87 &   alien &    77 \\
      one &    45 &    life &    40 &     two &   121 &   must &    66 &     new &    74 &  planet &    76 \\
     must &    42 &    evil & 