In [None]:
import pandas as pd
import numpy as np
import pickle
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk import word_tokenize
from nltk.corpus import stopwords
import folium
from datetime import datetime
%matplotlib inline

In [None]:
# Empty frame that the load loop below fills with the monthly tweet DataFrames
main_df = pd.DataFrame()

In [None]:
# File names of the monthly pickled DataFrames
available_paths = [
    "twitter_august.pkl",
    "twitter_july.pkl",
    "twitter_october.pkl",
    "twitter_september.pkl",
]
# Same files, prefixed with the directory they live in
available_paths_complete = list(map("datasets/filtered_dfs/{}".format, available_paths))

In [None]:
# Load every monthly pickle, then concatenate once.
# - DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# - Rebuilding main_df from scratch makes this cell safe to re-run
#   (appending onto the existing main_df duplicated the data on every re-run).
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
monthly_frames = []
for path in available_paths_complete:
    with open(path, "rb") as handle:
        monthly_frames.append(pickle.load(handle))

main_df = pd.concat(monthly_frames)

In [None]:
# Total number of rows loaded from all pickle files
len(main_df)

In [None]:
# Number of missing values in each column
main_df.isnull().sum()

In [None]:
# Print the number of tweets carrying each sentiment label, one count per line
for sentiment_label in ("POSITIVE", "NEGATIVE", "NEUTRAL"):
    rows_with_label = main_df[main_df["sentiment"] == sentiment_label]
    print(len(rows_with_label))

In [None]:
# One long space-separated string per sentiment class, used as word cloud input
positive_corpus = " ".join(main_df.loc[main_df["sentiment"] == "POSITIVE", "main"])
negative_corpus = " ".join(main_df.loc[main_df["sentiment"] == "NEGATIVE", "main"])

In [None]:
# Word cloud generated from the text of the POSITIVE tweets
wordcloud = WordCloud().generate(positive_corpus)
plt.imshow(wordcloud)
# Hide the axes: they carry no meaning for an image
plt.axis("off")

In [None]:
# Word cloud generated from the text of the NEGATIVE tweets (rebinds `wordcloud`)
wordcloud = WordCloud().generate(negative_corpus)
plt.imshow(wordcloud)
# Hide the axes: they carry no meaning for an image
plt.axis("off")

In [None]:
# Non-null tweet count per geo_state, before any cleaning
main_df.groupby("geo_state")["main"].count()

In [None]:
# Keep only the geo_state groups with more than 1000 tweets: the smaller ones
# are not Swiss cantons (~0.4% of the data)
main_df = main_df.groupby("geo_state").filter(lambda group: group["main"].count() > 1000)

In [None]:
# Non-null tweet count per geo_state, after dropping the small groups
main_df.groupby("geo_state")["main"].count()

In [None]:
# geo_state values to drop explicitly because they are not Swiss cantons
# (presumably they pass the tweet-count filter above, hence the manual list)
fake_cantons = ["Baden-Württemberg", "Haryana", "North Rhine-Westphalia"]

In [None]:
# Drop every row whose geo_state is listed in fake_cantons
is_fake_canton = main_df["geo_state"].isin(fake_cantons)
main_df = main_df[~is_fake_canton]

In [None]:
def merge_clean_df(pickle_list, base_dir="datasets/filtered_dfs", min_tweets=1000):
    '''Import DataFrames from different pickle files, merge them and clean the data.

    Parameters
    ----------
    pickle_list : iterable of str
        File names of the pickled DataFrames, relative to ``base_dir``.
    base_dir : str, optional
        Directory that contains the pickle files.
    min_tweets : int, optional
        A ``geo_state`` is kept only if it has strictly more than this many
        non-null ``main`` values.

    Returns
    -------
    pandas.DataFrame
        The concatenated frames, without the small ``geo_state`` groups and
        without the known non-Swiss regions.

    Raises
    ------
    ValueError
        If ``pickle_list`` is empty.
    '''
    # == MERGE ==
    # Build the paths from the argument (the global path list must not be used here,
    # otherwise ``pickle_list`` is silently ignored).
    paths = ["{}/{}".format(base_dir, name) for name in pickle_list]
    if not paths:
        raise ValueError("pickle_list must contain at least one file name")

    frames = []
    for path in paths:
        with open(path, "rb") as handle:
            # NOTE: pickle.load can execute arbitrary code; only load trusted files.
            frames.append(pickle.load(handle))

    # Single concat instead of repeated DataFrame.append (removed in pandas 2.0)
    merged = pd.concat(frames)

    # == CLEAN ==
    # Remove geo_state with few tweets, because they are not Swiss cantons. ~0.4% of the data
    merged = merged.groupby("geo_state").filter(
        lambda group: group["main"].count() > min_tweets
    )

    # Remove the regions that have enough tweets but are not Swiss cantons
    fake_cantons = ["Baden-Württemberg", "Haryana", "North Rhine-Westphalia"]
    merged = merged[~merged["geo_state"].isin(fake_cantons)]

    return merged

In [None]:
# Mean of the numeric columns per geo_state, ordered by sentiment_int, as a bar chart.
# numeric_only=True skips the text columns (main, lang, sentiment, ...), which
# would otherwise make mean() raise on recent pandas versions.
main_df.groupby("geo_state").mean(numeric_only=True).sort_values(by="sentiment_int").plot(kind="bar")

In [None]:
# Search terms for the railway company.
# NOTE(review): no other visible cell reads this list; the cff_sentiment filter
# repeats the three terms as literals.
sbb_words = ["cff", "sbb", "ffs"]

In [None]:
def contains_str(string):
    '''Return a per-row mask telling whether the "main" text of the global main_df contains `string`.

    The pattern is handed to `Series.str.contains` with its default settings.
    '''
    tweet_text = main_df["main"]
    return tweet_text.str.contains(string)

In [None]:
# Mean sentiment per geo_state, restricted to tweets mentioning "cff", "sbb" or "ffs".
# numeric_only=True skips the text columns, which would otherwise make mean()
# raise on recent pandas versions.
mentions_railway = contains_str("cff") | contains_str("sbb") | contains_str("ffs")
cff_sentiment = (
    main_df[mentions_railway]
    .groupby("geo_state")
    .mean(numeric_only=True)
    .sort_values(by="sentiment_int")
)
cff_sentiment

In [None]:
# Distinct geo_state values currently present in main_df
main_df.geo_state.unique()

In [None]:
def append_state_code(df):
    '''Add a "state_code" column to `df`, in place.

    Every value of `df.index` is looked up in a fixed state-name -> two-letter
    code table; a name missing from the table raises KeyError. Returns None.
    '''
    canton_codes = {
        'Zurich': 'ZH', 'Solothurn': 'SO', 'Geneva': 'GE', 'Lucerne': 'LU',
        'Thurgau': 'TG', 'Jura': 'JU', 'Grisons': 'GR', 'Valais': 'VS',
        'Fribourg': 'FR', 'Bern': 'BE', 'Schaffhausen': 'SH', 'Schwyz': 'SZ',
        'Vaud': 'VD', 'Saint Gallen': 'SG', 'Neuchâtel': 'NE', 'Aargau': 'AG',
        'Ticino': 'TI', 'Basel-City': 'BS', 'Basel-Landschaft': 'BL',
        'Obwalden': 'OW', 'Zug': 'ZG', 'Uri': 'UR', 'Glarus': 'GL',
        'Nidwalden': 'NW', 'Appenzell Innerrhoden': 'AI',
        'Appenzell Ausserrhoden': 'AR',
    }

    # Translate the index labels one by one so an unknown label fails loudly
    codes = []
    for state_name in df.index.values:
        codes.append(canton_codes[state_name])

    df['state_code'] = codes


In [None]:
# Adds the state_code column to cff_sentiment in place (needed as the map key below)
append_state_code(cff_sentiment)

In [None]:
# Choropleth of the railway-related (cff/sbb/ffs) mean sentiment_int per canton.
# TopoJSON file with the canton shapes
geo_path = 'utils/ch-cantons.topojson.json'
    

# NOTE(review): location/zoom presumably frame Switzerland - confirm visually.
cff_map = folium.Map(location=[46.57, 8], zoom_start=8)
# state_code is the key column and sentiment_int the value column; keys are
# matched against 'feature.id' of the 'objects.cantons' TopoJSON object.
# NOTE(review): Map.choropleth(geo_path=...) looks like a legacy folium API -
# confirm it exists in the installed folium version.
cff_map.choropleth(geo_path=geo_path, 
                     data=cff_sentiment,
                     columns=['state_code', 'sentiment_int'],
                     key_on='feature.id',
                     topojson='objects.cantons',
                     fill_color='YlGn'
                    )
cff_map

In [None]:
# Bar chart of the railway-related means per geo_state
cff_sentiment.plot.bar()

In [None]:
# Mean sentiment per geo_state over all tweets, with the state code attached.
# numeric_only=True skips the text columns, which would otherwise make mean()
# raise on recent pandas versions.
all_sentiment = main_df.groupby("geo_state").mean(numeric_only=True)
append_state_code(all_sentiment)
all_sentiment.sort_values(by="sentiment_int")

In [None]:
# Choropleth of the mean sentiment_int per canton over all tweets.
# TopoJSON file with the canton shapes
geo_path = 'utils/ch-cantons.topojson.json'
    

# NOTE(review): location/zoom presumably frame Switzerland - confirm visually.
all_map = folium.Map(location=[46.57, 8], zoom_start=8)
# state_code is the key column and sentiment_int the value column; keys are
# matched against 'feature.id' of the 'objects.cantons' TopoJSON object.
# NOTE(review): Map.choropleth(geo_path=...) looks like a legacy folium API -
# confirm it exists in the installed folium version.
all_map.choropleth(geo_path=geo_path, 
                     data=all_sentiment,
                     columns=['state_code', 'sentiment_int'],
                     key_on='feature.id',
                     topojson='objects.cantons',
                     fill_color='YlGn'
                    )
all_map

In [None]:
# Mean sentiment per geo_state over the English tweets only, with the state code attached.
# numeric_only=True skips the text columns, which would otherwise make mean()
# raise on recent pandas versions.
english_tweets = main_df[main_df["lang"] == 'en']
all_sentiment_en = english_tweets.groupby("geo_state").mean(numeric_only=True)
append_state_code(all_sentiment_en)
all_sentiment_en.sort_values(by="sentiment_int")

In [None]:
# Choropleth of the mean sentiment_int per canton over the English tweets.
# TopoJSON file with the canton shapes
geo_path = 'utils/ch-cantons.topojson.json'
    

# NOTE(review): location/zoom presumably frame Switzerland - confirm visually.
all_en_map = folium.Map(location=[46.57, 8], zoom_start=8)
# state_code is the key column and sentiment_int the value column; keys are
# matched against 'feature.id' of the 'objects.cantons' TopoJSON object.
# NOTE(review): Map.choropleth(geo_path=...) looks like a legacy folium API -
# confirm it exists in the installed folium version.
all_en_map.choropleth(geo_path=geo_path, 
                     data=all_sentiment_en,
                     columns=['state_code', 'sentiment_int'],
                     key_on='feature.id',
                     topojson='objects.cantons',
                     fill_color='YlGn'
                    )
all_en_map

**We think that the sentiment analysis works best on English and returns many "NEUTRAL" values for other languages. It is therefore better to use only the English tweets (~45% of the tweets).**

In [None]:
def search_df(df, search_terms, search_exclusive=False, langs=None):
    '''Return the rows of `df` whose "main" text matches the search terms.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a "main" (text) column and a "lang" column.
    search_terms : iterable of str
        Terms to look for; matching is case-insensitive (both the terms and the
        text are lowercased). Each term is passed to `Series.str.contains`, so
        it is interpreted as a regular expression.
    search_exclusive : bool, optional
        If True a row must contain ALL terms, otherwise ANY term is enough.
        With no terms at all, True returns every row and False returns none.
    langs : iterable of str, optional
        If given and non-empty, only rows whose "lang" is in `langs` are searched.

    Returns
    -------
    pandas.DataFrame
        The matching rows (after the optional language filter).
    '''
    # Lowercase search terms
    terms = [term.lower() for term in search_terms]

    # None instead of a mutable [] default; an empty collection still means "no filter"
    if langs is not None and len(langs) > 0:
        lang_filtered = df[df['lang'].isin(langs)]
    else:
        lang_filtered = df

    # Lowercase the text once instead of once per term
    text = lang_filtered['main'].str.lower()

    # Positional boolean mask: starts all-True for AND, all-False for OR
    mask = np.full(len(lang_filtered), bool(search_exclusive), dtype=bool)

    for term in terms:
        # na=False: rows with missing text never match (and never put NaN in the mask)
        hits = np.asarray(text.str.contains(term, na=False), dtype=bool)
        mask = (mask & hits) if search_exclusive else (mask | hits)

    return lang_filtered[mask]

In [None]:
# Spot-check: text of 5 randomly sampled tweets matching any of the three terms
# (no random_state is set, so the sample differs between runs)
search_df(main_df, search_terms=['sbb', 'cff', 'ffs']).sample(5)['main']

In [None]:
# English tweets that contain both "paleo" and "festival" (search_exclusive=True requires every term)
search_df(main_df, search_terms=['paleo', 'festival'], langs=['en'], search_exclusive=True)

In [None]:
# The 10 lang values with the most tweets (non-null "main" count), largest first
main_df.groupby("lang").count().sort_values(by="main", ascending=False)["main"].head(10)

**The main languages are English (en), German (de), French (fr), Spanish (es) and Italian (it).**

**There seem to be more Spanish tweets than Italian ones... This might indicate a problem with the language detection.**

In [None]:
# Tweet count per geo_state for the tweets tagged 'es', largest first
main_df[main_df['lang'] == 'es'].groupby('geo_state').count().sort_values('main', ascending=False)['main']

In [None]:
# Tweet count per geo_state for the tweets tagged 'it', largest first
main_df[main_df['lang'] == 'it'].groupby('geo_state').count().sort_values('main', ascending=False)['main']

In [None]:
# Tweet count per geo_state for the tweets tagged 'pt' (presumably Portuguese), largest first
main_df[main_df['lang'] == 'pt'].groupby('geo_state').count().sort_values('main', ascending=False)['main']

In [None]:
def append_date(df):
    '''Return the "published" column of `df` with parse_date applied to every value.

    Despite the name, nothing is appended: `df` itself is left unchanged.
    '''
    published = df["published"]
    return published.apply(parse_date)

In [None]:
# Show the raw "published" value of one random row to inspect its format
main_df.sample()["published"].to_string(index=False)

In [None]:
def parse_date(string):
    '''Parse a timestamp string such as "2016-08-01T12:34:56Z" into a datetime.

    The trailing "Z" is matched as a literal character, so the result carries no
    timezone information. A string in any other format raises ValueError.
    '''
    return datetime.strptime(string, "%Y-%m-%dT%H:%M:%SZ")

In [None]:
# Try parse_date on 10 random rows before converting the whole column
main_df.sample(10)["published"].apply(parse_date)

In [None]:
# Replace the "published" strings with parsed datetimes.
# NOTE(review): parse_date feeds each value to strptime, which expects a string, so
# re-running this cell after the conversion will presumably fail - run it once only.
main_df["published"] = main_df["published"].apply(parse_date)

In [None]:
#with open("datasets/parsed_filtered_df.pkl", "wb") as handle:
#    pickle.dump(main_df, handle)