## Getting and saving the cleaned twitter data
* We load the cleaned dataframes saved in the twitter_cleaning notebook and do some further analysis. In particular, we tokenize the tweets and remove the stopwords so that we can do some analysis on word frequencies.
* The dataframe with tokenized tweets is saved as 'twitter_data/twitter_forFreq.parquet' and can be loaded directly.

In [1]:
import os
import re
from itertools import chain

import nltk
import pandas as pd
import pyspark.sql
import pyspark.sql.functions as func
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, udf
from pyspark.sql.types import *
from tqdm import tqdm

conf = SparkConf().setAppName("ADA-gcl")
# `sc` was previously used without ever being defined, so this cell only worked
# when the kernel pre-injected a SparkContext (e.g. a pyspark shell). getOrCreate
# reuses the active context when there is one and otherwise builds it from `conf`.
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = SQLContext(sc)

## Load the parquet data

In [1]:
# Cleaned tweets written by the twitter_cleaning notebook.
path = 'file:///home/kirtan/Academics/EPFL/sem1/ADA/ADA-Project/twitter_data/final_twitter1.parquet'
# Copy 'canton_' / 'sentiment_' to columns without the trailing underscore
# (appended as the last two columns), then remove the underscore-suffixed originals.
df = (
    sqlContext.read.parquet(path)
    .withColumn('canton', col('canton_'))
    .withColumn('sentiment', col('sentiment_'))
    .drop('canton_')
    .drop('sentiment_')
)

In [7]:
# Print a preview of the loaded dataframe (show() displays the first 20 rows by default).
df.show()

+-------------------+--------+--------------------+-------+-----------+-----+---+----------+---------+
|                 id|language|                main| gender|time_period|month|tmp|    canton|sentiment|
+-------------------+--------+--------------------+-------+-----------+-----+---+----------+---------+
|1451606943000012161|      en|@NaliniSingh we a...|UNKNOWN|      Night|    1|  1|      Vaud|        0|
|1451607568000005493|      en|If bae gets any s...| FEMALE|      Night|    1|  1|      Vaud|        0|
|1451609302000001820|      en|Listening to "Bre...|   MALE|      Night|    1|  1|    Zurich|        0|
|1451618447000009640|      en|@Strippin @dexbon...|UNKNOWN|      Night|    1|  1|      Vaud|        0|
|1451619941000014002|      en|@VictoriaJustice ...| FEMALE|      Night|    1|  1|      Vaud|        0|
|1451623493000004879|      en|10NL Folded flush...| FEMALE|      Night|    1|  1|      Vaud|        0|
|1451632164000001178|      en|@Iam_boika I love...| FEMALE|        Day|  

In [26]:
# Get hashtags
def get_hashtags(twt):
    """Return the hashtag words found in a tweet, without the leading '#'.

    Parameters
    ----------
    twt : str or None
        Tweet text. None (a null cell) yields an empty list instead of
        raising TypeError inside ``re.findall``.

    Returns
    -------
    list of str
        Every run of word characters that directly follows a '#', in order
        of appearance; empty when there are none.
    """
    if twt is None:
        return []
    return re.findall(r"#(\w+)", twt)

# Wrap get_hashtags as a Spark UDF returning array<string> and store its
# result for the 'main' column in a new 'tags' column.
udfTags = udf(get_hashtags, returnType=ArrayType(StringType()))
df = df.withColumn('tags', udfTags(col("main")))

In [122]:
# Get stopwords and clean main
# Union of NLTK's English, French and German stopword lists. `stopwords` is
# nltk.corpus.stopwords (imported in the first cell); the corpus has to be
# available locally, e.g. after running nltk.download('stopwords').
stpWords = set().union(*(stopwords.words(language) for language in ("english", "french", "german")))
def clean_main(twt, stop_words=None):
    """Tokenize a tweet into lowercase words without mentions, URLs or stopwords.

    Parameters
    ----------
    twt : str, list/tuple of str, or None
        Tweet text. A list/tuple of strings is joined with single spaces
        first. None (a null cell) yields an empty list.
    stop_words : collection of str, optional
        Words to discard after lowercasing. Defaults to the module-level
        ``stpWords`` set.

    Returns
    -------
    list of str
        Lowercased tokens, in order, that are not in ``stop_words``.
    """
    if twt is None:
        return []
    if stop_words is None:
        stop_words = stpWords
    # Only join real token sequences: ' '.join() on a plain string inserts a
    # space between every character, which would turn each letter into a token.
    text = ' '.join(twt) if isinstance(twt, (list, tuple)) else twt
    # Blank out @mentions, every character that is not an ASCII letter, space
    # or tab (this also removes '#', digits and punctuation), and URLs.
    # NOTE(review): accented letters are stripped as well, so French/German
    # words are split at the accent - confirm this is acceptable.
    cleaned = re.sub(r"(@[A-Za-z0-9_]+)|([^A-Za-z \t])|(\w+:\/\/\S+)", " ", text)
    lowered = (token.lower() for token in cleaned.split())
    return [word for word in lowered if word not in stop_words]

# Tokenize 'main' into a new array<string> column 'tweet'.
udfTokenize = udf(clean_main, ArrayType(StringType()))
df = df.withColumn('tweet', udfTokenize("main"))
# Drop the raw text and the tweet id by column *name*. A bare `id` is Python's
# builtin function, which DataFrame.drop rejects (it only accepts a string or a Column).
df = df.drop('main').drop('id')

In [141]:
def concat(type):
    """Build a Spark UDF that concatenates several array columns into one array.

    `type` is the element type handed to ArrayType for the UDF's return type.
    The returned UDF takes any number of array columns and yields their
    elements in argument order.
    """
    def concat_(*args):
        merged = []
        for arr in args:
            merged.extend(arr)
        return merged
    return udf(concat_, ArrayType(type))

# UDF that concatenates array<string> columns.
concat_string_arrays = concat(StringType())

# Append the hashtags to the tokenized tweet in a 'keywords' column ...
df = df.withColumn('keywords', concat_string_arrays(col("tweet"), col("tags")))
# ... then drop both 'tags' and the 'keywords' column that was just created, so only
# 'tweet' survives (consistent with the df.show() output of the next cell).
# NOTE(review): creating and immediately dropping 'keywords' looks unintended - confirm
# whether 'keywords' was meant to be kept, or remove the withColumn above.
df = df.drop(df.tags).drop(df.keywords)

In [144]:
# Preview the dataframe after tokenization (show() displays the first 20 rows by default).
df.show()

+--------+-------+-----------+-----+---+----------+---------+--------------------+
|language| gender|time_period|month|tmp|    canton|sentiment|               tweet|
+--------+-------+-----------+-----+---+----------+---------+--------------------+
|      en|UNKNOWN|      Night|    1|  1|      Vaud|        0|[anau, fishing, t...|
|      en| FEMALE|      Night|    1|  1|      Vaud|        0|[bae, gets, smoot...|
|      en|   MALE|      Night|    1|  1|    Zurich|        0|[listening, bread...|
|      en|UNKNOWN|      Night|    1|  1|      Vaud|        0|[jesus, didnt, re...|
|      en| FEMALE|      Night|    1|  1|      Vaud|        0|[vic, take, look,...|
|      en| FEMALE|      Night|    1|  1|      Vaud|        0|[nl, folded, flus...|
|      en| FEMALE|        Day|    1|  1|    Geneva|        1|        [love, papi]|
|      en|UNKNOWN|        Day|    1|  1|      Vaud|        0|[kanye, west, rel...|
|      en|   MALE|        Day|    1|  1|      Vaud|        0|[thanks, retweet,...|
|   

The dataframe obtained above is saved as 'twitter_data/twitter_forFreq.parquet' so it can be loaded directly.

### Just read dataframe directly 

In [2]:
# Load the tokenized tweets from the saved parquet file; this rebinds `df`.
df = sqlContext.read.parquet('file:///home/kirtan/Academics/EPFL/sem1/ADA/ADA-Project/twitter_data/twitter_forFreq.parquet')

In [3]:
# Filter the dataframe by given word
def filter_df(df, keywords, column):
    """Keep the rows of `df` whose `column` value contains at least one keyword.

    Parameters
    ----------
    df : Spark DataFrame to filter.
    keywords : iterable of values to look for; a row is kept as soon as any
        one of them is found (``keyword in value``).
    column : name of the column to test (e.g. the token-list column 'tweet').

    Returns
    -------
    A DataFrame containing only the matching rows. Rows whose `column` value
    is null never match (previously they raised TypeError inside the UDF).
    """
    # Materialize once so a one-shot iterable works and the UDF closure is stable.
    wanted = tuple(keywords)

    def check_L1_in_L2(l2):
        if l2 is None:
            return False
        return any(word in l2 for word in wanted)

    filt = udf(check_L1_in_L2, BooleanType())
    return df.where(filt(col(column)))

# Aggregate dataframe by 'column' and calculate weighted mean of sentiment
def get_grouped(df, column):
    """Group `df` by `column` and summarize sentiment per group.

    Returns a DataFrame with the grouping column plus:
      * 'avg_sentiment' - sum(sentiment) / sum(tmp) within the group
      * 'count'         - sum(tmp) within the group
    """
    total_weight = func.sum(df.tmp)
    weighted_mean = func.sum(df.sentiment) / total_weight
    return df.groupBy(column).agg(
        weighted_mean.alias('avg_sentiment'),
        total_weight.alias('count'),
    )

In [41]:
# Per-canton 'avg_sentiment' and 'count' over the whole dataframe (no keyword filter).
get_grouped(df, 'canton').show()

+--------------------+-------------------+-------+
|              canton|      avg_sentiment|  count|
+--------------------+-------------------+-------+
|                Bern| 0.2026460148434979|  68178|
|             Lucerne| 0.2885554149340083|  11062|
|           Neuchâtel|0.26464646464646463|   3465|
|Appenzell Innerrh...| 0.1276595744680851|     47|
|        Schaffhausen| 0.3213728549141966|   1923|
|                Vaud|0.19293273025541155|1762293|
|              Ticino| 0.1091281351305238|  29305|
|                 Uri|0.23214285714285715|     56|
|              Valais|0.22400329871401695| 116409|
|                Jura|0.23194748358862144|    457|
|            Obwalden|0.13312693498452013|   1292|
|              Geneva| 0.1413494047751649| 583561|
|             Thurgau|0.19573400250941028|    797|
|           Nidwalden|            0.33125|    160|
|           Solothurn|0.28759493670886077|   3950|
|          Basel-City| 0.2411903027091475| 241552|
|              Zurich| 0.243113

In [5]:
# Filters the df by keywords from column tweet and gives average sentiment grouped by column
def get_final_df(df, keywords, column, grp_column):
    """Filter `df` with filter_df(df, keywords, column), then aggregate the
    remaining rows with get_grouped(..., grp_column)."""
    matching_rows = filter_df(df, keywords, column)
    return get_grouped(matching_rows, grp_column)

In [11]:
# Keep only the tweets whose 'tweet' token list contains the word 'federer', then
# group them by canton to get the average sentiment (and 'count') for each canton.
get_final_df(df, ['federer'], 'tweet','canton').show()

+----------+--------------------+-----+
|    canton|       avg_sentiment|count|
+----------+--------------------+-----+
|      Bern|                 0.0|    4|
|   Lucerne|                 0.0|    1|
|      Vaud| 0.31266245822502786| 2693|
|    Ticino|  0.6666666666666666|    6|
|    Valais| 0.16666666666666666|   18|
|    Geneva|0.038461538461538464|   26|
| Solothurn|                -0.5|    2|
|Basel-City| 0.27835051546391754|   97|
|    Zurich| 0.15897435897435896|  195|
|  Fribourg|                 0.0|    5|
|    Aargau|                -1.0|    1|
|   Grisons|                 0.0|    2|
+----------+--------------------+-----+



In [20]:
# Keep only the tweets whose 'tweet' token list contains at least one of the listed
# words, then give the average sentiment (and 'count') for each month.
get_final_df(df, ['india', 'inde'], 'tweet', 'month').show()

+-----+--------------------+-----+
|month|       avg_sentiment|count|
+-----+--------------------+-----+
|    1|0.011363636363636364|  264|
|    2| 0.05573248407643312|  628|
|    3| 0.04885496183206107|  655|
|    4| 0.04983388704318937|  602|
|    5| 0.06732673267326733|  505|
|    6|0.053061224489795916|  490|
|    7| 0.08074534161490683|  644|
|    8| 0.13218884120171673| 1165|
|    9| 0.13898026315789475| 1216|
|   10| 0.16253968253968254| 1575|
+-----+--------------------+-----+

