In [1]:
import pandas as pd
import numpy as np
import pickle
from random import random

%matplotlib inline

In [2]:
with open("datasets/parsed_filtered_df.pkl", "rb") as handle:
    main_df = pickle.load(handle)

In [3]:
main_df_opinion = main_df[main_df["sentiment"] != 0]

In [4]:
def filter_lang(df, langs):
    """ filter the df with one or several languages """
    return df[df['lang'].isin(langs)]

def filter_relevant_states(df, threshold):
    """ keep states that have more than `threshold` tweets """
    return df.groupby("geo_state").filter(lambda x: x.count()["main"] > threshold)

def append_weekday(df):
    df["weekday"] = df["published"].apply(lambda x: x.weekday())

def filter_weekday(df, days): #drop weekday after?
    return df[df['weekday'].isin(days)]

def filter_df(df, langs = ['en'], days = [0, 1, 2, 3, 4, 5, 6], threshold=0):
    # Language filter
    df = filter_lang(df, langs)
    # Threshold filter if necessary
    if threshold > 0:
        df = filter_relevant_states(df, threshold)
    # Weekday filter
    append_weekday(df)
    df = filter_weekday(df, days)
    df.drop('weekday', 1)
    return df

In [5]:
filter_df(main_df_opinion, ['es'], [0,1,2]).sample(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,author_gender,geo_state,lang,main,published,source_location,sentiment,weekday
1125682,MALE,Basel-City,es,@jkn_vk bastantes menos de los que usted cree ...,2016-07-19 19:35:03,"London, sometimes Switzerland, each time less ...",1.0,1


In [9]:
def get_happy_sad_tweet(df, happy):
    if happy:
        selected_state = main_df_opinion.groupby("geo_state").mean().sort_values(by='sentiment', ascending=False).index[0]
    else:
        selected_state = main_df_opinion.groupby("geo_state").mean().sort_values(by='sentiment', ascending=False).index[-1]
    # getting indexes of tweets in the happiest/saddest state
    indexes = main_df_opinion[main_df_opinion['geo_state'] == selected_state]['main'].index
    # randomly selecting one of them
    indexes_list = list(indexes)
    random_index = int(random()*len(list(indexes)))
    tweet_selected_index = indexes_list[random_index]
    return main_df_opinion.loc[tweet_selected_index]['main']
    

In [10]:
get_happy_sad_tweet(main_df_opinion, False)

'global warming is my inspiration'

In [11]:
happy_state = main_df_opinion.groupby("geo_state").mean().sort_values(by='sentiment', ascending=False).index[0]
sad_state = main_df_opinion.groupby("geo_state").mean().sort_values(by='sentiment', ascending=False).index[-1]

indexes = main_df_opinion[main_df_opinion['geo_state'] == happy_state]['main'].index
indexes_list = list(indexes)
random_index = int(random()*len(list(indexes)))
tweet_selected_index = indexes_list[random_index]
main_df_opinion.loc[tweet_selected_index]['main']

'Oh wie schön könnte das Lebens ein :) heute Morgen einfach mal um CHF 13500 reicher :D #spam'

### Counting

In [14]:
def search_df(df, search_terms, search_exclusive=False):
    """ Searches the df for either one of the terms. If `search_exclusive` is True, then searches the df for entries that have all terms """
    # Lowercase search terms
    search_terms = [t.lower() for t in search_terms]

    # Create a boolean array to subset the dataframe with search matching terms
    if search_exclusive:
        search_filter_bool = np.ones(len(df), dtype=bool)

        for term in search_terms:
            search_filter_bool = search_filter_bool & df['main'].str.lower().str.contains(term)
    else:
        search_filter_bool = np.zeros(len(df), dtype=bool)

        for term in search_terms:
            search_filter_bool = search_filter_bool | df['main'].str.lower().str.contains(term)

    return df[search_filter_bool]

# Ratio of roesti tweets


In [19]:

roesti_search = search_df(main_df, ["rösti", "roesti", "röschti", "roeschti"], search_exclusive=False)
roesti_tweets = roesti_search.groupby("geo_state")["main"].count()

total_tweets = main_df.groupby("geo_state")["main"].count()
(roesti_tweets / total_tweets).dropna().sort_values()

geo_state
Neuchâtel           0.000022
Ticino              0.000026
Geneva              0.000034
Basel-Landschaft    0.000046
Valais              0.000048
Basel-City          0.000059
Schaffhausen        0.000065
Vaud                0.000069
Schwyz              0.000073
Fribourg            0.000105
Thurgau             0.000115
Lucerne             0.000117
Zurich              0.000136
Aargau              0.000136
Grisons             0.000183
Bern                0.000246
Obwalden            0.000262
Saint Gallen        0.000332
Solothurn           0.000367
Name: main, dtype: float64

In [26]:
count_df = pd.DataFrame(roesti_tweets/total_tweets)
count_df.head()

Unnamed: 0_level_0,main
geo_state,Unnamed: 1_level_1
Aargau,0.000136
Appenzell Ausserrhoden,
Appenzell Innerrhoden,
Basel-City,5.9e-05
Basel-Landschaft,4.6e-05


In [27]:
def count_df(searched_df):
    """Propotion of tweets talking about a certain topic. Computation may be long. 
    `searched_df` must have been done using the whole dataset! (not only neutral)
    Returns a dataframe"""
    topic_tweets = searched_df.groupby("geo_state")["main"].count()
    total_tweets = main_df.groupby("geo_state")["main"].count()
    return pd.DataFrame(topic_tweets/total_tweets)

In [68]:
import pandas as pd
import numpy as np
import folium

from random import random

# ========== Tweet Selection ==========

def get_happy_sad_tweet(df, happy):
    """ Randomly selects a tweet from the happiest (`happy` = True) or saddest (`happy` = False) canton and returns it as a Serie"""
    # If df is empty return empty string
    if len(df) < 1:
        return ''
    if happy:
        selected_state = main_df_opinion.groupby("geo_state").mean().sort_values(by='sentiment', ascending=False).index[0]
    else:
        selected_state = main_df_opinion.groupby("geo_state").mean().sort_values(by='sentiment', ascending=False).index[-1]
    # getting indexes of tweets in the happiest/saddest state
    indexes = main_df_opinion[main_df_opinion['geo_state'] == selected_state]['main'].index
    # randomly selecting one of them
    indexes_list = list(indexes)
    random_index = int(random()*len(list(indexes)))
    tweet_selected_index = indexes_list[random_index]
    return main_df_opinion.loc[tweet_selected_index]


# ========== Sub-functions to filter data ==========

def filter_lang(df, langs):
    """ filter the df with one or several languages """
    return df[df['lang'].isin(langs)]

def filter_relevant_states(df, threshold):
    """ keep states that have more than `threshold` tweets """
    return df.groupby("geo_state").filter(lambda x: x.count()["main"] > threshold)

def append_weekday(df):
    df["weekday"] = df["published"].apply(lambda x: x.weekday())

def filter_weekday(df, days): #drop weekday after?
    return df[df['weekday'].isin(days)]

def filter_df(df, langs = ['en'], days = [0, 1, 2, 3, 4, 5, 6], threshold=0):
    """Filters the DataFrame according to language, weekdays, and threshold"""
    # Language filter
    df = filter_lang(df, langs)
    # Threshold filter if necessary
    if threshold > 0:
        df = filter_relevant_states(df, threshold)
    # Weekday filter
    append_weekday(df)
    df = filter_weekday(df, days)
    df = df.drop('weekday', 1)
    return df


# ========== Search & Count ==========


def search_df(df, search_terms, search_exclusive=False):
    """ Searches the df for either one of the terms. If `search_exclusive` is True, then searches the df for entries that have all terms """
    # Lowercase search terms
    search_terms = [t.lower() for t in search_terms]

    # Create a boolean array to subset the dataframe with search matching terms
    if search_exclusive:
        search_filter_bool = np.ones(len(df), dtype=bool)

        for term in search_terms:
            search_filter_bool = search_filter_bool & df['main'].str.lower().str.contains(term)
    else:
        search_filter_bool = np.zeros(len(df), dtype=bool)

        for term in search_terms:
            search_filter_bool = search_filter_bool | df['main'].str.lower().str.contains(term)

    return df[search_filter_bool]


def count_df(searched_df, main_df):
    """Propotion of tweets talking about a certain topic. Computation may be long. 
    `searched_df` must have been done using the whole dataset! (not only neutral)
    Returns a dataframe"""
    topic_tweets = searched_df.groupby("geo_state")["sentiment"].count()
    total_tweets = main_df.groupby("geo_state")["sentiment"].count()
    return pd.DataFrame(topic_tweets/total_tweets)



# ========== Map Generation ==========

def generate_folium(df, count=False):
    if not count:
        df_to_map = df.groupby("geo_state").mean()
    else:
        df_to_map = df
    
    df_to_map = append_state_code(df_to_map)

    geo_path = 'utils/ch-cantons.topojson.json'

    folium_map = folium.Map(location=[46.8, 8.2], zoom_start=8)
    
    print("if count")
    
    if not count:
        thresh = [-0.66, -0.33, 0, 0.33, 0.66]
        colors = 'RdYlGn'
        legend = 'Happineess level 2016 per state'

    else:
        thresh = [0, 0.01, 0.02] #try different stuff
        colors = 'BuPu'
        legend = 'Proportion of tweets'

    folium_map.choropleth(geo_path=geo_path,
                         data=df_to_map,
                         columns=['state_code', 'sentiment'],
                         key_on='feature.id',
                         topojson='objects.cantons',
                         threshold_scale=thresh,
                         fill_color=colors,
                         legend_name=legend
                        )
    return folium_map

def append_state_code(df):
    '''Adds state code in a new column, (semi-)in place'''

    state_to_code = {
        'Zurich': 'ZH',
        'Solothurn': 'SO',
        'Geneva': 'GE',
        'Lucerne': 'LU',
        'Thurgau': 'TG',
        'Jura': 'JU',
        'Grisons': 'GR',
        'Valais': 'VS',
        'Fribourg': 'FR',
        'Bern': 'BE',
        'Schaffhausen': 'SH',
        'Schwyz': 'SZ',
        'Vaud': 'VD',
        'Saint Gallen': 'SG',
        'Neuchâtel': 'NE',
        'Aargau': 'AG',
        'Ticino': 'TI',
        'Basel-City': 'BS',
        'Basel-Landschaft': 'BL',
        'Obwalden': 'OW',
        'Zug': 'ZG',
        'Uri': 'UR',
        'Glarus': 'GL',
        'Nidwalden': 'NW',
        'Appenzell Innerrhoden': 'AI',
        'Appenzell Ausserrhoden': 'AR'
    }

    print(df)
    df['state_code'] = [state_to_code[index] for index in df.index.values]

    # Create missing rows to match the json
    for state, state_code in state_to_code.items():
        if state_code not in df['state_code'].tolist():
            df = df.append(pd.Series({"state_code": state_code, "sentiment": 0}), ignore_index=True)
    print(df)
    return df


In [69]:
def serve_map(search_query):

    if search_query:
        df = search_df(opinion_df, search_query.split(" "), search_exclusive=False)
    else:
        df = opinion_df

    #df = filter_df(df, ['en'], [0, 1, 2])

    print("yoyoyyoo")
    df_count = count_df(df, opinion_df)
    #folium_map = generate_folium(df) 
    folium_map = generate_folium(df_count, count=True) 
    folium_map.save("app/maps/map-test-%s.html" % search_query)

    print(search_query)

In [70]:
opinion_df = main_df_opinion
serve_map('hello')

yoyoyyoo
                        sentiment
geo_state                        
Aargau                   0.002715
Appenzell Ausserrhoden   0.002203
Appenzell Innerrhoden    0.001361
Basel-City               0.000941
Basel-Landschaft         0.003200
Bern                     0.001683
Fribourg                 0.003900
Geneva                   0.002609
Glarus                   0.000873
Grisons                  0.002583
Jura                     0.001718
Lucerne                  0.001637
Neuchâtel                0.003020
Nidwalden                0.001661
Obwalden                 0.005192
Saint Gallen             0.002747
Schaffhausen             0.002931
Schwyz                   0.003357
Solothurn                0.012807
Thurgau                  0.002319
Ticino                   0.001935
Uri                      0.000346
Valais                   0.002793
Vaud                     0.004132
Zug                      0.004159
Zurich                   0.002348
                        sentiment state