In [None]:
import pandas as pd
import pickle
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk import word_tokenize
from nltk.corpus import stopwords
%matplotlib inline

In [None]:
# Empty accumulator; the DataFrames loaded from the pickle files are appended to it below.
main_df = pd.DataFrame()

In [None]:
# File names of the pickles to load.
available_paths = [
    "twitter_august.pkl",
    "twitter_july.pkl",
    "twitter_october.pkl",
    "twitter_september.pkl",
]
# Same files, prefixed with the directory they live in.
available_paths_complete = ["datasets/filtered_dfs/" + file_name for file_name in available_paths]

In [None]:
# NOTE(security): pickle.load can execute arbitrary code -- only load trusted files.
# Collect every frame first and concatenate once: DataFrame.append was removed in
# pandas 2.0, and rebuilding main_df from scratch means re-running this cell does
# not duplicate the rows.
frames = []
for path in available_paths_complete:
    with open(path, "rb") as handle:
        new_df = pickle.load(handle)
    frames.append(new_df)

# Like the former append(), this keeps each frame's original index labels.
main_df = pd.concat(frames)

In [None]:
# Total number of rows after merging.
main_df.shape[0]

In [None]:
# Missing values per column.
main_df.isna().sum()

In [None]:
# Print the number of rows for each sentiment label, one count per line.
for label in ("POSITIVE", "NEGATIVE", "NEUTRAL"):
    rows_with_label = main_df[main_df["sentiment"] == label]
    print(len(rows_with_label))

In [None]:
# Concatenate the "main" text of every positive / negative row into one
# space-separated string each.
is_positive = main_df["sentiment"] == "POSITIVE"
is_negative = main_df["sentiment"] == "NEGATIVE"
positive_corpus = " ".join(main_df.loc[is_positive, "main"])
negative_corpus = " ".join(main_df.loc[is_negative, "main"])

In [None]:
# Word cloud of the positive corpus, drawn without axes.
wordcloud = WordCloud()
wordcloud.generate(positive_corpus)
plt.imshow(wordcloud)
plt.axis("off")

In [None]:
# Word cloud of the negative corpus, drawn without axes.
wordcloud = WordCloud()
wordcloud.generate(negative_corpus)
plt.imshow(wordcloud)
plt.axis("off")

In [None]:
# Non-null "main" values per geo_state.
main_df.groupby("geo_state")["main"].count()

In [None]:
# Drop every geo_state with at most 1000 tweets: they are not Swiss cantons
# (~0.4% of the data).
main_df = main_df.groupby("geo_state").filter(lambda group: group["main"].count() > 1000)

In [None]:
# Non-null "main" values per geo_state, after the small groups were removed.
main_df.groupby("geo_state")["main"].count()

In [None]:
# geo_state values to exclude from the analysis.
fake_cantons = [
    "Baden-Württemberg",
    "Haryana",
    "North Rhine-Westphalia",
]

In [None]:
# Keep only the rows whose geo_state is not listed in fake_cantons.
is_fake_canton = main_df["geo_state"].isin(fake_cantons)
main_df = main_df[~is_fake_canton]

In [None]:
def merge_clean_df(pickle_list, data_dir="datasets/filtered_dfs", min_tweets=1000):
    '''Load pickled DataFrames, merge them and drop rows from unwanted geo_states.

    Parameters
    ----------
    pickle_list : iterable of str
        File names of the pickle files, relative to ``data_dir``.
    data_dir : str, optional
        Directory that contains the pickle files.
    min_tweets : int, optional
        A geo_state is kept only if it has more than this many non-null
        "main" values.

    Returns
    -------
    pandas.DataFrame
        The concatenated frames (original index labels preserved), without the
        small geo_state groups and without the hard-coded non-Swiss states.
        An empty DataFrame is returned when ``pickle_list`` is empty.
    '''
    # == MERGE ==
    paths = ["{}/{}".format(data_dir, name) for name in pickle_list]

    frames = []
    # Iterate over the paths built from the argument, not over a notebook global.
    for path in paths:
        with open(path, "rb") as handle:
            # NOTE(security): pickle.load can execute arbitrary code -- only
            # load trusted files.
            frames.append(pickle.load(handle))

    if not frames:
        # pd.concat raises on an empty list; nothing to merge or clean.
        return pd.DataFrame()

    # One concat instead of repeated DataFrame.append (removed in pandas 2.0).
    main_df = pd.concat(frames)

    # == CLEAN ==
    # Remove geo_state with few tweets, because they are not Swiss cantons.
    # ~0.4% of the data
    main_df = main_df.groupby("geo_state").filter(
        lambda group: group["main"].count() > min_tweets
    )

    # Remove the remaining large groups that are not Swiss cantons either.
    fake_cantons = ["Baden-Württemberg", "Haryana", "North Rhine-Westphalia"]
    main_df = main_df[~main_df["geo_state"].isin(fake_cantons)]

    return main_df

In [None]:
# Bar chart of the per-geo_state column means, ordered by mean sentiment_int.
# numeric_only=True is required on pandas >= 2.0, where mean() raises a
# TypeError on string columns such as "main" and "sentiment" instead of
# silently dropping them.
main_df.groupby("geo_state").mean(numeric_only=True).sort_values(by="sentiment_int").plot(kind="bar")

In [None]:
# Keywords to look for in the tweet text.
sbb_words = [
    "cff",
    "sbb",
    "ffs",
]

In [None]:
def contains_str(string):
    '''Return a boolean mask over main_df: True where "main" contains `string`.

    `string` is interpreted as a regular expression (the pandas default) and
    the match is case-sensitive. Rows whose "main" value is missing yield False
    rather than NaN, so the mask can always be used for boolean indexing.
    '''
    return main_df["main"].str.contains(string, na=False)

In [None]:
# Rows mentioning any of the keywords in sbb_words (instead of repeating the
# literals here).
sbb_mask = contains_str(sbb_words[0])
for word in sbb_words[1:]:
    sbb_mask = sbb_mask | contains_str(word)

# Per-geo_state column means of those rows, ordered by mean sentiment_int.
# numeric_only=True is required on pandas >= 2.0, where mean() raises a
# TypeError on string columns instead of silently dropping them.
(
    main_df[sbb_mask]
    .groupby("geo_state")
    .mean(numeric_only=True)
    .sort_values(by="sentiment_int")
)