In [None]:
import pandas as pd
import numpy as np
import pickle

%matplotlib inline

In [None]:
# Load the pickled object stored in datasets/parsed_filtered_df.pkl; the rest
# of the notebook uses it as a pandas DataFrame.
# NOTE(review): unpickling can execute arbitrary code, so only load files
# from a trusted source.
with open("datasets/parsed_filtered_df.pkl", mode="rb") as pickle_file:
    main_df = pickle.load(pickle_file)

In [None]:
def search_df(df, search_terms, search_exclusive=False, regex=False):
    """Return the rows of ``df`` whose 'main' text matches the search terms.

    Matching is case-insensitive: both the terms and the 'main' column are
    lowercased before they are compared.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a text column named 'main'.
    search_terms : iterable of str
        Terms to look for as substrings of 'main'.
    search_exclusive : bool, default False
        If True a row must contain *all* terms (AND); otherwise containing
        *any* term is enough (OR).
    regex : bool, default False
        If False, terms are matched as literal substrings, so characters
        such as '+', '.' or '(' are safe to search for. Pass True to treat
        each (lowercased) term as a regular expression.

    Returns
    -------
    pandas.DataFrame
        The matching rows, in their original order. With no search terms
        the result is every row when ``search_exclusive`` is True and no
        rows otherwise.
    """
    lowered_terms = [term.lower() for term in search_terms]

    # Lowercase the text once instead of once per search term.
    lowered_text = df['main'].str.lower()

    # Neutral starting mask: all True for AND, all False for OR.
    if search_exclusive:
        mask = np.ones(len(df), dtype=bool)
    else:
        mask = np.zeros(len(df), dtype=bool)

    for term in lowered_terms:
        # na=False: rows with missing text never match. Without it a NaN
        # ends up in the mask and the boolean indexing below fails.
        hits = lowered_text.str.contains(term, regex=regex, na=False).to_numpy(dtype=bool)
        mask = (mask & hits) if search_exclusive else (mask | hits)

    return df[mask]

In [None]:
# Keep only tweets that carry an opinion (non-zero sentiment): the neutral
# zeros would otherwise be included in every mean computed below.
# Working on the smaller frame also makes the computations a lot faster.
# .copy() makes this an independent frame, so the later column assignment on
# it does not raise pandas' SettingWithCopyWarning.
main_df_opinion = main_df[main_df["sentiment"] != 0].copy()

## Example 1: Swiss trains by state

In [None]:
# Opinionated tweets that mention at least one of the three acronyms (OR search)
cff_search = search_df(main_df_opinion, search_terms=["cff", "sbb", "ffs"], search_exclusive=False)

In [None]:
# Per-state mean of the numeric columns, ordered by mean sentiment.
# numeric_only=True: non-numeric columns (e.g. the 'main' text) cannot be
# averaged and make pandas >= 2.0 raise a TypeError instead of dropping them.
cff_search.groupby("geo_state").mean(numeric_only=True).sort_values(by="sentiment").plot(kind="bar")

In [None]:
def filter_lang(df, langs):
    """Return the rows of ``df`` whose 'lang' value is one of ``langs``.

    ``langs`` is a list-like of accepted values; rows keep their original
    order and index.
    """
    lang_is_wanted = df['lang'].isin(langs)
    return df.loc[lang_is_wanted]

In [None]:
def keep_relevant_states(df, threshold=20):
    """Drop the rows of every state that has too few entries.

    Rows are grouped by 'geo_state'; a group is kept only when its number of
    non-null 'main' values is strictly greater than ``threshold``.
    """
    def has_enough_rows(state_rows):
        # count() ignores missing values, so only rows with text are counted
        return state_rows["main"].count() > threshold

    rows_by_state = df.groupby("geo_state")
    return rows_by_state.filter(has_enough_rows)

## Example 2: Saudi Arabia by gender

In [None]:
# Opinionated tweets mentioning "saudi arabia", averaged per author gender.
saudi_search = search_df(main_df_opinion, ["saudi arabia"])
# numeric_only=True: non-numeric columns (e.g. the 'main' text) cannot be
# averaged and make pandas >= 2.0 raise a TypeError instead of dropping them.
saudi_search.groupby("author_gender").mean(numeric_only=True)

## Example 3: Hillary vs Trump by state

In [None]:
# Tweets containing both "donald" and "trump" (search_exclusive=True means AND)
trump_search = search_df(main_df_opinion, ["donald", "trump"], search_exclusive=True)
# Drop states with too few matching tweets (default threshold of keep_relevant_states)
trump_search = keep_relevant_states(trump_search)
# numeric_only=True: non-numeric columns (e.g. the 'main' text) cannot be
# averaged and make pandas >= 2.0 raise a TypeError instead of dropping them.
trump_search.groupby("geo_state").mean(numeric_only=True).sort_values("sentiment").plot(kind="bar")

In [None]:
# Overall mean sentiment over all kept Trump tweets (not a mean of the per-state means)
trump_search.loc[:, "sentiment"].mean()

In [None]:
# Tweets containing both "hillary" and "clinton" (search_exclusive=True means AND)
hillary_search = search_df(main_df_opinion, ["hillary", "clinton"], search_exclusive=True)
# Drop states with too few matching tweets (default threshold of keep_relevant_states)
hillary_search = keep_relevant_states(hillary_search)
# numeric_only=True: non-numeric columns (e.g. the 'main' text) cannot be
# averaged and make pandas >= 2.0 raise a TypeError instead of dropping them.
hillary_search.groupby("geo_state").mean(numeric_only=True).sort_values("sentiment").plot(kind="bar")

In [None]:
# Overall mean sentiment over all kept Hillary tweets (not a mean of the per-state means)
hillary_search.loc[:, "sentiment"].mean()

## Example 4: Marijuana by state

In [None]:
# Tweets containing any of the three terms (search_exclusive=False means OR)
weed_search = search_df(main_df_opinion, ["marijuana", "cannabis", "weed"], search_exclusive=False)
# Drop states with too few matching tweets (default threshold of keep_relevant_states)
weed_search = keep_relevant_states(weed_search)
# numeric_only=True: non-numeric columns (e.g. the 'main' text) cannot be
# averaged and make pandas >= 2.0 raise a TypeError instead of dropping them.
weed_search.groupby("geo_state").mean(numeric_only=True).sort_values("sentiment").plot(kind="bar")

## Example 5: Justin Bieber by state (a bad example: most of the matching tweets are radio tweets)

In [None]:
# Tweets containing both terms (AND); search_df lowercases the terms itself
bieber_search = search_df(main_df_opinion, ["Justin", "Bieber"], search_exclusive=True)
# Drop states with too few matching tweets (default threshold of keep_relevant_states)
bieber_search = keep_relevant_states(bieber_search)
# numeric_only=True: non-numeric columns (e.g. the 'main' text) cannot be
# averaged and make pandas >= 2.0 raise a TypeError instead of dropping them.
bieber_search.groupby("geo_state").mean(numeric_only=True).sort_values("sentiment").plot(kind="bar")

In [None]:
# Inspect the individual matching tweets of a single state
bieber_search.loc[bieber_search["geo_state"] == "Aargau"]

## Example 6: Roesti mentions, by state

In [None]:
# Per state: number of tweets mentioning rösti (any of four spellings)
# divided by the total number of tweets. All tweets are used here, not only
# the opinionated ones.
roesti_search = search_df(
    main_df,
    ["rösti", "roesti", "röschti", "roeschti"],
    search_exclusive=False,
)
roesti_tweets = roesti_search.groupby("geo_state")["main"].count()

total_tweets = main_df.groupby("geo_state")["main"].count()

# States without any rösti tweet have no numerator (NaN) and are dropped
roesti_ratio = (roesti_tweets / total_tweets).dropna()
roesti_ratio.sort_values().plot(kind="bar")

## Example 7: McDonalds, by state

In [None]:
# Tweets containing "mcdo" (with a single term, AND and OR searches are equivalent)
mcdo_search = search_df(main_df_opinion, ["mcdo"], search_exclusive=True)
# Drop states with too few matching tweets (default threshold of keep_relevant_states)
mcdo_search = keep_relevant_states(mcdo_search)
# numeric_only=True: non-numeric columns (e.g. the 'main' text) cannot be
# averaged and make pandas >= 2.0 raise a TypeError instead of dropping them.
mcdo_search.groupby("geo_state").mean(numeric_only=True).sort_values("sentiment").plot(kind="bar")

## Example 8: Are people happier on the weekend?

In [None]:
# Add the day of the week of each tweet, taken from x.weekday() on 'published'
# (assumes datetime-like values, for which Monday is 0 and Sunday is 6).
# assign() builds a new frame instead of writing a column into a filtered
# slice of main_df, which avoids pandas' SettingWithCopyWarning and keeps the
# cell safe to re-run.
main_df_opinion = main_df_opinion.assign(
    weekday=main_df_opinion["published"].apply(lambda x: x.weekday())
)

In [None]:
# Mean of the numeric columns per weekday, for tweets located in Zurich only.
# numeric_only=True: non-numeric columns (e.g. the 'main' text) cannot be
# averaged and make pandas >= 2.0 raise a TypeError instead of dropping them.
main_df_opinion[main_df_opinion["geo_state"] == "Zurich"].groupby("weekday").mean(numeric_only=True)

Other ideas:
* Involvement of each canton in a subject: count(tweets about the subject) / count(all tweets), computed per canton