In [2]:
import polars as pl
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

# Load Moby Dick
moby_dick = load_etext(2701, mirror='https://gutenberg.pglaf.org/')
moby_dick = strip_headers(moby_dick).strip()

sherlock = load_etext(1661, mirror='https://gutenberg.pglaf.org/')
sherlock = strip_headers(sherlock).strip()

scarlet = load_etext(244, mirror='https://gutenberg.pglaf.org/')
scarlet = strip_headers(scarlet).strip()

df = pl.LazyFrame(
    {
        'index': pl.arange(3, eager=True),
        'authors': ['Hermann Melville', 'Arthur Conan Doyle', 'Arthur Conan Doyle'],
        'books': ['Moby Dick', 'The Adventures of Sherlock Holmes', 'A Study in Scarlet'],
        'texts': [moby_dick, sherlock, scarlet],
    }
)

STOPWORDS = {
    "he", "her", "hers", "herself", "him", "himself", "his", "i", "me", "mine", "my", "myself", 
    "our", "ours", "ourselves", "she", "thee", "their", "them", "themselves", "they", "thou", 
    "thy", "thyself", "us", "we", "ye", "you", "your", "yours", "yourself"
}

# Tokenise
df = df.with_columns(
    pl.col('texts')
    .str.replace_all("[`']", '')
    .str.extract_all('\w+')
    .list.eval(
        pl.element().filter(
            ~pl.element().is_in(STOPWORDS) | ~pl.element().str.contains('.*\d+.*')
        )
    )
    .alias('tokens')
)

In [16]:
row_token_freqs = (
    df
    .explode('tokens')
    .group_by(['index', 'tokens'])
    .agg(pl.len().alias('frequency'))
)

top_k_tokens = (
    row_token_freqs
    .group_by('tokens')
    .agg(pl.sum('frequency').alias('token_freqs'))
    .sort('token_freqs', descending=True)
    .limit(500)
    .select('tokens')
)

z_scores = (
    row_token_freqs
    .join(top_k_tokens, on='tokens', how='inner')
    .select([
        pl.col('*'),
        (pl.col('frequency') - pl.mean('frequency').over('tokens')
            / pl.std('frequency').over('tokens')).alias('z_score')
    ])
)

ColumnNotFoundError: index

In [22]:
import numpy as np

token_counts = np.array([[2, 1, 1, np.nan, np.nan],
                         [1, 2, 1, 1, 1],
                         [np.nan, np.nan, np.nan, 4, 1]])

means = np.nanmean(token_counts, axis=0)
stds = np.nanstd(token_counts, axis=0, ddof=1)

print(means, stds)

token_counts = (token_counts - means) / stds
token_counts

[1.5 1.5 1.  2.5 1. ] [0.70710678 0.70710678 0.         2.12132034 0.        ]


  token_counts = (token_counts - means) / stds


array([[ 0.70710678, -0.70710678,         nan,         nan,         nan],
       [-0.70710678,  0.70710678,         nan, -0.70710678,         nan],
       [        nan,         nan,         nan,  0.70710678,         nan]])

In [16]:
np.array([0, 1, 4]).std()

np.float64(1.699673171197595)

In [1]:
import polars as pl
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

# Load Moby Dick
moby_dick = load_etext(2701, mirror='https://gutenberg.pglaf.org/')
moby_dick = strip_headers(moby_dick).strip()

sherlock = load_etext(1661, mirror='https://gutenberg.pglaf.org/')
sherlock = strip_headers(sherlock).strip()

scarlet = load_etext(244, mirror='https://gutenberg.pglaf.org/')
scarlet = strip_headers(scarlet).strip()

df = pl.DataFrame(
    {
        'index': pl.arange(3, eager=True),
        'authors': ['Hermann Melville', 'Arthur Conan Doyle', 'Arthur Conan Doyle'],
        'books': ['Moby Dick', 'The Adventures of Sherlock Holmes', 'A Study in Scarlet'],
        'texts': [moby_dick, sherlock, scarlet],
    }
)

STOPWORDS = {
    "he", "her", "hers", "herself", "him", "himself", "his", "i", "me", "mine", "my", "myself", 
    "our", "ours", "ourselves", "she", "thee", "their", "them", "themselves", "they", "thou", 
    "thy", "thyself", "us", "we", "ye", "you", "your", "yours", "yourself"
}

df = df.with_columns(
    pl.col('texts')
    .str.replace_all("[`']", '')
    .str.extract_all('\w+')
    .list.eval(
        pl.element().filter(
            ~pl.element().is_in(STOPWORDS) | ~pl.element().str.contains('.*\d+.*')
        )
    )
    .alias('tokens')
# Calculate number of tokens in each text
)

top_token_freqs = (
    df.select(
        pl.col('tokens')
        .explode()
        .value_counts(sort=True)
    )
    .unnest('tokens')
    .limit(500)
)

df = df.explode('tokens')

frequency_df = (
    df
    .filter(
        pl.col('tokens').is_in(top_token_freqs.select('tokens'))
    )
    .group_by(['index', 'tokens'])
    .agg(pl.len().alias('frequency'))
)

unique_tokens = frequency_df.select('tokens').unique().get_column('tokens')

pivoted_df = (
    frequency_df.pivot(
        values='frequency',
        index='index',
        on='tokens',
        aggregate_function='first'
    )
    .fill_null(0)
)

# pivoted_df = (
#     frequency_df
#     .group_by('index')
#     .agg([
#         pl.when(pl.col('tokens') == token)
#         .then(pl.col('frequency'))
#         .otherwise(0)
#         .sum()
#         .alias(token)
#         for token in unique_tokens  # Create a column for each unique token
#     ])
# )


result = (
    df
    .select(['index', 'authors', 'books', 'tokens', 'texts'])
    .unique()
    .join(pivoted_df, on='index', how='left')
)

: 

In [73]:
df.select('tokens').dtypes[0] == pl.String

True

In [96]:
df.select('tokens').dtypes[0] == pl.String

True

In [1]:
df.explode('tokens')

frequency_df = (
    df
    .filter(
        pl.col('tokens').is_in(top_token_freqs.select('tokens'))
    )
    .group_by(['index', 'tokens'])
    .agg(pl.len().alias('frequency'))
)
pivoted_df = (
    frequency_df.pivot(
        values='frequency',
        index='index',
        on='tokens',
        aggregate_function='first'
    )
    .fill_null(0)
)

df = (
    df
    .select(['index', 'authors', 'books', 'tokens', 'texts'])
    .unique()
    .join(pivoted_df, on='index', how='left')
)

df

NameError: name 'df' is not defined

In [86]:
columns_to_normalize = [col for col in df.columns if col not in ['index', 'authors', 'books', 'tokens', ]]

print(df.columns)
    
# Calculate mean and standard deviation once
stats = df.select([
    pl.col(columns_to_normalize).mean().alias('mean'),
    pl.col(columns_to_normalize).std().alias('std')
])

# Perform normalization
normalized = df.select([
    pl.col('index'),
    pl.col('authors'),
    pl.col('books'),
    *[((pl.col(col) - pl.col('mean')) / pl.col('std')).fill_nan(0).alias(col)
        for col in columns_to_normalize]
])

normalized.join(stats, how='cross')



# (token_counts - token_counts.mean()) / token_counts.std()

['index', 'authors', 'books', 'texts', 'tokens']


InvalidOperationError: `std` operation not supported for dtype `str`

In [13]:
df = df.with_columns([
    pl.col('tokens')
    .list.eval(
        pl.element()
        .explode()
        .value_counts(sort=True)
        .struct.rename_fields(('token', 'count'))
    )
    .alias('token_counts')
])

# .map_elements(lambda x: pl.fold(
#         acc=pl.struct({'temp': 1}),
#         function=lambda acc, y: acc.extend({y['token']: y['count']}),
#         exprs=x
#     ))

# .map_rows(lambda x: pl.fold(
#     acc=pl.struct(dummy=pl.lit(None)),  # Start with a dummy field
#     function=lambda acc, y: acc.struct.extend(pl.struct({y['token']: y['count']})),
#     exprs=pl.col('token_counts')
# ), return_dtype=pl.Struct([pl.Field('token_counts', pl.Object)]))

In [7]:
# unique_tokens = (
#     df
#     .explode('tokens')  # Explode to have one token per row
#     .select('tokens')   # Select only the tokens column
#     .unique()           # Get unique tokens
#     .to_series()        # Convert to Series
#     .to_list()          # Convert Series to a list of unique tokens
# )

# (
#     df.explode('tokens')
#     .group_by(('authors', 'books', 'tokens'))
#     .agg(
#         pl.len().alias('frequency')
#     )
#     .group_by(['authors', 'books'])
#     .agg([
#             pl.when(pl.col('token') == token)
#             .then(pl.col('frequency'))
#             .otherwise(None)
#             .sum()
#             # .over(['authors', 'books'])
#             .alias(token)
#             for token in unique_tokens  # Create one column for each unique token
#     ])
# )


In [11]:
unique_tokens = exploded.select('token').unique()

# Prepare pivot expressions
pivot_exprs = [
    pl.when(pl.col('token') == token)
    .then(pl.col('frequency'))
    .otherwise(None)
    .sum()
    .over(['authors', 'books'])
    .alias(token)
    for token in unique_tokens.get_column('token')
]

# Apply pivot expressions
pivoted = exploded.group_by(['authors', 'books']).agg(pivot_exprs)

return pivoted.select(
    ['authors', 'books'] + 
    [pl.col(token).fill_null(0) for token in unique_tokens.collect().get_column('token')]
)

StructFieldNotFoundError: token

This error occurred with the following context stack:
	[1] 'select' failed
	[2] 'select' input failed to resolve
	[3] 'unique' input failed to resolve


In [43]:
top_tokens = df.select(
    pl.col('tokens')
    .explode()
    .value_counts(sort=True, normalize=True)
).unnest('tokens').head(500)

In [48]:
token_freqs = df.group_by(('authors', 'books')).agg(
    pl.col('tokens')
    .list.explode()
    .value_counts(sort=True, normalize=True)
    .alias('token_freqs')
)

# Alternative that preserves all columns
token_freqs = df.with_columns(
    pl.col('tokens')
    .list.eval(
        pl.element()
        .value_counts(sort=True, normalize=True)
        .struct.rename_fields(('token', 'frequency'))
    )
    .list.eval(
        pl.element().filter(
            pl.element().struct.field('token').is_in(top_tokens.select('tokens'))
        )
    )
    .alias('token_freqs')
)

token_freqs

SchemaError: invalid series dtype: expected `Struct`, got `str`

In [19]:
token_counts = df.select([
    pl.col('authors'),
    pl.col('books'),
    pl.col('tokens').list.explode().alias('token')
]).group_by(('authors', 'books', 'token')).count()

ComputeError: Series length 3 doesn't match the DataFrame height of 369571

In [17]:
token_counts.select('token_counts').schema

Schema([('token_counts', List(Struct({'token': String, 'count': UInt32})))])

In [38]:
# Calculate word frequencies
word_freqs = token_counts.with_columns([
    pl.col('token_counts').struct.field('count')
])

word_freqs

authors,books,texts,tokens,token_count,token_counts,count
str,str,str,list[str],u32,struct[2],struct[2]
"""Hermann Melville""","""Moby Dick""","""MOBY-DICK; or, THE WHALE. By…","[""MOBY"", ""DICK"", … ""orphan""]",219454,"{{""the"",13813},{""of"",6592}}","{""of"",6592}"
"""Arthur Conan Doyle""","""The Adventures of Sherlock Hol…","""HOLMES *** The Adventures …","[""HOLMES"", ""The"", … ""success""]",105956,"{{""the"",5263},{""I"",3038}}","{""I"",3038}"
"""Arthur Conan Doyle""","""A Study in Scarlet""","""A STUDY IN SCARLET By A. Cona…","[""A"", ""STUDY"", … ""arca""]",44161,"{{""the"",2326},{""and"",1321}}","{""and"",1321}"


In [8]:
# Get top tokens over whole corpus - alternate implementation
top_token_counts = df.select(
    pl.col('tokens')
    .explode()
    .value_counts(sort=True)
    .head(50)
    .struct.field(['tokens', 'count'])
)

top_token_counts

tokens,count
str,u32
"""the""",21402
"""of""",10416
"""and""",10214
"""to""",8355
"""a""",8097
…,…
"""man""",965
"""been""",955
"""up""",940
"""no""",927


In [8]:
# Get length of each book
df = df.with_columns(
    pl.col('tokens')
    .list.len()
    .alias('token_count')
)

df

authors,books,texts,tokens,token_count
str,str,str,list[str],u32
"""Hermann Melville""","""Moby Dick""","""MOBY-DICK; or, THE WHALE. By…","[""MOBY"", ""DICK"", … ""orphan""]",219454
"""Arthur Conan Doyle""","""The Adventures of Sherlock Hol…","""HOLMES *** The Adventures …","[""HOLMES"", ""The"", … ""success""]",105956
"""Arthur Conan Doyle""","""A Study in Scarlet""","""A STUDY IN SCARLET By A. Cona…","[""A"", ""STUDY"", … ""arca""]",44161


In [22]:
# Get token counts of top tokens for each book
df = df.with_columns(
    pl.col('tokens')
    .list.eval(
        pl.element().filter(
            pl.element().is_in(top_token_counts.select('tokens'))
        )
        .value_counts()
    )
    .list.to_struct()
    .alias('top_token_counts')
)

df

authors,books,texts,tokens,top_token_counts
str,str,str,list[str],struct[50]
"""Hermann Melville""","""Moby Dick""","""MOBY-DICK; or, THE WHALE. By…","[""MOBY"", ""DICK"", … ""orphan""]","{{""of"",6592},{""as"",1621},{""there"",716},{""me"",627},{""up"",507},{""have"",760},{""he"",1662},{""one"",893},{""but"",1113},{""my"",564},{""so"",918},{""is"",1698},{""to"",4561},{""for"",1421},{""all"",1466},{""by"",1144},{""the"",13813},{""you"",841},{""no"",486},{""was"",1634},{""man"",508},{""from"",1061},{""a"",4591},{""on"",1016},{""this"",1281},{""when"",554},{""out"",529},{""an"",582},{""in"",3942},{""be"",1030},{""or"",703},{""not"",1105},{""that"",2981},{""are"",587},{""said"",304},{""at"",1236},{""had"",767},{""we"",413},{""were"",679},{""his"",2472},{""been"",415},{""him"",1060},{""it"",2210},{""upon"",540},{""with"",1663},{""I"",2120},{""and"",6075},{""s"",1800},{""The"",702},{""which"",642}}"
"""Arthur Conan Doyle""","""The Adventures of Sherlock Hol…","""HOLMES *** The Adventures …","[""HOLMES"", ""The"", … ""success""]","{{""to"",2715},{""for"",695},{""are"",329},{""me"",635},{""the"",5263},{""by"",322},{""The"",341},{""his"",1103},{""that"",1651},{""which"",763},{""one"",338},{""when"",265},{""upon"",463},{""you"",1231},{""was"",1392},{""but"",469},{""on"",336},{""we"",412},{""so"",414},{""be"",624},{""no"",299},{""up"",300},{""there"",343},{""in"",1692},{""man"",302},{""my"",907},{""a"",2538},{""I"",3038},{""been"",393},{""not"",612},{""at"",729},{""with"",803},{""and"",2818},{""of"",2625},{""as"",770},{""he"",1165},{""had"",822},{""this"",405},{""from"",477},{""it"",1290},{""have"",904},{""s"",365},{""is"",1106},{""all"",370},{""or"",194},{""were"",345},{""him"",434},{""out"",319},{""an"",323},{""said"",486}}"
"""Arthur Conan Doyle""","""A Study in Scarlet""","""A STUDY IN SCARLET By A. Cona…","[""A"", ""STUDY"", … ""arca""]","{{""been"",147},{""not"",170},{""be"",248},{""as"",299},{""s"",190},{""me"",212},{""an"",128},{""are"",132},{""up"",133},{""at"",289},{""which"",315},{""or"",112},{""his"",613},{""out"",120},{""this"",178},{""that"",619},{""a"",968},{""when"",100},{""with"",313},{""from"",173},{""on"",190},{""had"",470},{""have"",276},{""of"",1199},{""we"",142},{""and"",1321},{""it"",453},{""one"",150},{""you"",367},{""was"",647},{""to"",1079},{""upon"",195},{""all"",173},{""in"",673},{""but"",172},{""man"",155},{""is"",288},{""him"",271},{""no"",142},{""The"",190},{""there"",160},{""he"",630},{""my"",273},{""so"",139},{""said"",207},{""I"",943},{""were"",169},{""by"",152},{""the"",2326},{""for"",303}}"


In [247]:
# Get token counts of top tokens for each author - SEE IF THERE IS A WAY TO MERGE TOP_TOKEN_COUNTS STRUCTS FOR FASTER IMPLEMENTATION
df.group_by('authors').agg(
    pl.col('tokens')
    .list.eval(
        pl.element().filter(
            pl.element().is_in(top_token_counts.select('tokens'))
        )
    )
    .explode()
    .value_counts()
    .alias('top_token_counts')
).with_columns(
    pl.col('top_token_counts')
    .list.to_struct()
)

authors,top_token_counts
str,struct[50]
"""Arthur Conan Doyle""","{{""out"",439},{""upon"",658},{""so"",553},{""me"",847},{""been"",540},{""from"",650},{""we"",554},{""are"",461},{""have"",1180},{""with"",1116},{""were"",514},{""by"",474},{""had"",1292},{""that"",2270},{""no"",441},{""when"",365},{""is"",1394},{""in"",2365},{""he"",1795},{""you"",1598},{""as"",1069},{""this"",583},{""up"",433},{""I"",3981},{""for"",998},{""and"",4139},{""him"",705},{""his"",1716},{""all"",543},{""was"",2039},{""there"",503},{""on"",526},{""but"",641},{""said"",693},{""at"",1018},{""the"",7589},{""be"",872},{""which"",1078},{""to"",3794},{""or"",306},{""an"",451},{""my"",1180},{""one"",488},{""a"",3506},{""it"",1743},{""The"",531},{""not"",782},{""of"",3824},{""man"",457},{""s"",555}}"
"""Hermann Melville""","{{""s"",1800},{""from"",1061},{""the"",13813},{""all"",1466},{""out"",529},{""me"",627},{""my"",564},{""and"",6075},{""to"",4561},{""by"",1144},{""had"",767},{""with"",1663},{""him"",1060},{""be"",1030},{""there"",716},{""when"",554},{""his"",2472},{""but"",1113},{""on"",1016},{""this"",1281},{""The"",702},{""a"",4591},{""as"",1621},{""was"",1634},{""that"",2981},{""have"",760},{""no"",486},{""for"",1421},{""it"",2210},{""is"",1698},{""one"",893},{""of"",6592},{""in"",3942},{""you"",841},{""not"",1105},{""been"",415},{""upon"",540},{""said"",304},{""are"",587},{""which"",642},{""so"",918},{""we"",413},{""at"",1236},{""up"",507},{""or"",703},{""were"",679},{""an"",582},{""I"",2120},{""he"",1662},{""man"",508}}"


In [41]:
# Get token counts of top tokens for each author-book combination
df.group_by(['authors', 'books']).agg(
    pl.col('tokens')
    .list.eval(
        pl.element().filter(
            pl.element().is_in(top_token_counts.select('tokens'))
        )
    )
    .explode()
    .value_counts()
    .alias('top_token_counts')
).with_columns(
    pl.col('top_token_counts')
    .list.to_struct()
)

authors,books,top_token_counts
str,str,struct[50]
"""Arthur Conan Doyle""","""The Adventures of Sherlock Hol…","{{""there"",343},{""have"",904},{""his"",1103},{""was"",1392},{""that"",1651},{""up"",300},{""man"",302},{""been"",393},{""but"",469},{""at"",729},{""are"",329},{""the"",5263},{""one"",338},{""as"",770},{""a"",2538},{""which"",763},{""him"",434},{""out"",319},{""on"",336},{""you"",1231},{""is"",1106},{""from"",477},{""The"",341},{""and"",2818},{""of"",2625},{""in"",1692},{""with"",803},{""said"",486},{""no"",299},{""me"",635},{""he"",1165},{""we"",412},{""this"",405},{""it"",1290},{""upon"",463},{""I"",3038},{""or"",194},{""s"",365},{""by"",322},{""not"",612},{""for"",695},{""were"",345},{""all"",370},{""my"",907},{""to"",2715},{""when"",265},{""so"",414},{""an"",323},{""had"",822},{""be"",624}}"
"""Arthur Conan Doyle""","""A Study in Scarlet""","{{""in"",673},{""by"",152},{""him"",271},{""from"",173},{""on"",190},{""The"",190},{""but"",172},{""me"",212},{""the"",2326},{""been"",147},{""or"",112},{""his"",613},{""he"",630},{""my"",273},{""it"",453},{""not"",170},{""to"",1079},{""we"",142},{""that"",619},{""s"",190},{""as"",299},{""and"",1321},{""with"",313},{""at"",289},{""there"",160},{""a"",968},{""man"",155},{""all"",173},{""I"",943},{""for"",303},{""upon"",195},{""have"",276},{""an"",128},{""up"",133},{""this"",178},{""of"",1199},{""so"",139},{""was"",647},{""had"",470},{""were"",169},{""one"",150},{""out"",120},{""when"",100},{""is"",288},{""said"",207},{""you"",367},{""no"",142},{""be"",248},{""which"",315},{""are"",132}}"
"""Hermann Melville""","""Moby Dick""","{{""at"",1236},{""was"",1634},{""this"",1281},{""is"",1698},{""I"",2120},{""which"",642},{""of"",6592},{""as"",1621},{""one"",893},{""from"",1061},{""The"",702},{""him"",1060},{""he"",1662},{""said"",304},{""no"",486},{""there"",716},{""we"",413},{""upon"",540},{""on"",1016},{""were"",679},{""but"",1113},{""his"",2472},{""up"",507},{""man"",508},{""you"",841},{""have"",760},{""or"",703},{""the"",13813},{""be"",1030},{""that"",2981},{""for"",1421},{""a"",4591},{""s"",1800},{""not"",1105},{""to"",4561},{""out"",529},{""it"",2210},{""been"",415},{""me"",627},{""my"",564},{""when"",554},{""in"",3942},{""are"",587},{""with"",1663},{""so"",918},{""an"",582},{""all"",1466},{""had"",767},{""by"",1144},{""and"",6075}}"
