In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import signal

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit

from statsmodels.tsa import arima_model
import statsmodels.api as sm

from bokeh.plotting import figure, output_notebook, show, reset_output
from bokeh.colors import RGB
from bokeh.models import ColumnDataSource, Slider, CustomJS, ColorBar, DateSlider
from bokeh.io import curdoc
from bokeh.layouts import widgetbox, column, gridplot
from bokeh.transform import linear_cmap
from bokeh.palettes import Spectral10, RdYlGn10

# Configure Bokeh so that show() renders inside the notebook.
output_notebook()

# seaborn "colorblind" palette.
# NOTE(review): `c` is not referenced in the visible cells - confirm it is still needed.
c = sns.color_palette("colorblind")

from pyspark.sql import functions

import itertools

### Spark

In [2]:
import os

# Launch arguments for pyspark: the Cassandra connector package, the Cassandra
# contact hosts, and the shell entry point. Set before the SparkContext is created.
os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join([
    '--packages com.datastax.spark:spark-cassandra-connector_2.11:2.3.0',
    '--conf spark.cassandra.connection.host=172.31.80.79,172.31.94.11,172.31.86.202',
    'pyspark-shell',
])

In [3]:
from pyspark import SparkContext
from pyspark.sql import SQLContext

# Local Spark context, and the SQL context used below to read the Cassandra tables.
sc = SparkContext(master="local", appName="movie lens app")
sqlContext = SQLContext(sparkContext=sc)

In [4]:
def load_and_get_table_df(keys_space_name, table_name):
    """Load one Cassandra table through the module-level ``sqlContext``.

    Args:
        keys_space_name: Name of the Cassandra keyspace that holds the table.
        table_name: Name of the table to read.

    Returns:
        Whatever ``sqlContext.read.format(...).options(...).load()`` returns
        for that keyspace/table.
    """
    reader = sqlContext.read.format("org.apache.spark.sql.cassandra")
    reader = reader.options(table=table_name, keyspace=keys_space_name)
    return reader.load()

# Spark view of the `queryd` table in the `gdelt` keyspace.
queryd = load_and_get_table_df(keys_space_name='gdelt', table_name='queryd')

In [5]:
# Number of rows in `queryd`.
queryd.count()

7549807

### Python

In [6]:
from cassandra.cluster import Cluster

# Direct access with the Python Cassandra driver: one contact point, `gdelt` keyspace.
cluster = Cluster(contact_points=['172.31.86.202'])
session = cluster.connect(keyspace='gdelt')

# NOTE(review): `rows` is not used in the visible cells - confirm this query is still needed.
rows = session.execute('select * from queryd')

# 1. Requête D

* Groupement des données par mois

In [7]:
# One row per (country1, country2, month): mean of `mean_tone`, sum of `n_articles`.
# NOTE(review): this is an unweighted mean of the per-row mean_tone values; if each
# row already aggregates several articles, a mean weighted by n_articles may be
# what is intended - confirm against the table definition.
data_month = (
    queryd
    .groupby("country1", "country2", "month")
    .agg(
        functions.mean("mean_tone").alias("mean_tone"),
        functions.sum("n_articles").alias("n_articles"),
    )
)

In [8]:
# Bring the monthly aggregate from Spark into a pandas DataFrame.
df_month = data_month.toPandas()

In [9]:
# Couples ranked by article count, largest first.
df_month.sort_values("n_articles", ascending=False)

Unnamed: 0,country1,country2,month,mean_tone,n_articles
708111,American,United States,5,-1.401431,433170
521186,American,United States,6,-1.419249,400265
464398,American,United States,4,-1.399124,398140
801986,American,United States,2,-1.233848,393248
586394,American,United States,3,-1.265355,383891
...,...,...,...,...,...
672838,French Republic,Republic Of Tunisia,3,1.446773,1
209704,Barbadian,New Zealander,4,2.597253,1
209707,Republic Of Ireland,Venezuelans,2,-2.222222,1
475740,Reunion,Singaporeans,2,4.231228,1


In [10]:
# Couples that pair a country with another spelling of itself (United States and
# China variants). The ranking above shows them dominating the article counts, so
# they are removed before looking at relations between distinct countries.
same_country_pairs = [
    ("American", "Americans"),
    ("America", "United States"),
    ("American", "United States"),
    ("Americans", "United States"),
    ("America", "Americans"),
    ("America", "American"),
    ("China", "Chinese"),
]

# Match on (country1, country2) tuples, computed once, rather than on the two names
# glued together: a concatenated key has no delimiter and is rebuilt per comparison.
pair_index = pd.MultiIndex.from_arrays([df_month.country1, df_month.country2])
is_same_country = pair_index.isin(same_country_pairs)

data = df_month[~is_same_country].sort_values(by='n_articles', ascending=False)

In [11]:
# Month 1 rows of the filtered, count-sorted frame: the most covered couples first.
data.loc[data["month"] == 1].head()

Unnamed: 0,country1,country2,month,mean_tone,n_articles
608207,China,United States,1,-1.425263,164128
782384,Canada,United States,1,-0.964342,140018
516548,Russia,United States,1,-2.465218,114526
152572,Mexico,United States,1,-1.674815,112641
276338,United Kingdom,United States,1,-0.477418,111836


In [16]:
# Bubble matrix of country couples: one circle per (country1, country2) cell,
# coloured by the monthly mean tone and sized by the monthly number of articles.
# A slider selects the month; colours, sizes and tooltips all follow it.

list_countries_pair = sorted(["Mexico", "American", "China", "France", "Russia", "Iran"])
n = len(list_countries_pair)
list_month = ["Janvier", "Fevrier", "Mars", "Avril", "Mai", "Juin"]

RdYlGn10_inv = RdYlGn10[::-1]
n_colors = len(RdYlGn10_inv)
neutral_color = RdYlGn10_inv[5]   # colour of cells that have no data
max_circle_size = 100             # scaling factor applied to the article counts

# The x axis shows every country but the last, the y axis every country but the first.
x_countries = list_countries_pair[:-1]
y_countries = list_countries_pair[1:]

# The list is sorted, so combinations() already yields (smaller, larger) couples.
combinaison_contries = list(itertools.combinations(list_countries_pair, 2))

df_all_month = data[data.country1.isin(x_countries) & data.country2.isin(y_countries)]
if df_all_month.empty:
    raise ValueError("No row of `data` matches the selected countries")

TOOLTIPS = [("country1 : ", "@country1"),
            ("country2 : ", "@country2"),
            # These two columns always hold the month currently selected by the slider.
            ("n articles : ", "@n_articles"),
            ("mean tone : ", "@mean_tone")]

p = figure(x_range=x_countries, y_range=y_countries, plot_width=1000, plot_height=650, tooltips=TOOLTIPS,
           title="Ton moyen (couleur) et nombre d'article (taille) par couple de pays",
           x_axis_label="Pays",
           y_axis_label="Pays")

# Global ranges (all months together) so colours and sizes are comparable across months.
min_tone, max_tone = df_all_month.mean_tone.min(), df_all_month.mean_tone.max()
delta_tone = max_tone - min_tone

min_article, max_article = df_all_month.n_articles.min(), df_all_month.n_articles.max()
delta_article = max_article - min_article
# Guard the division below when every couple has the same article count.
article_scale = delta_article if delta_article > 0 else max(max_article, 1)


def _tone_color(mean_tone):
    """Return the palette colour of a mean tone, using the colour bar's equal-width bins."""
    if pd.isna(mean_tone) or not delta_tone > 0:
        return neutral_color
    position = (mean_tone - min_tone) / delta_tone
    # n_colors equal bins over [min_tone, max_tone]; the maximum falls in the last bin.
    return RdYlGn10_inv[min(int(position * n_colors), n_colors - 1)]


# Every cell of the matrix, country1 varying slowest (same order as the glyph data).
grid_pairs = list(itertools.product(x_countries, y_countries))
x = [pair[0] for pair in grid_pairs]
y = [pair[1] for pair in grid_pairs]

df_tones = pd.DataFrame({"country1": x, "country2": y})

for month in df_all_month.month.unique():
    df_current = df_all_month.loc[df_all_month.month == month]

    # First row found for each couple -> (mean_tone, n_articles); one pass, no re-filtering.
    stats_by_pair = {}
    for pair_c1, pair_c2, pair_tone, pair_count in zip(df_current.country1, df_current.country2,
                                                       df_current.mean_tone, df_current.n_articles):
        stats_by_pair.setdefault((pair_c1, pair_c2), (pair_tone, pair_count))

    mean_tone_values = []
    n_articles_values = []
    colors = []
    sizes = []
    for pair in grid_pairs:
        if pair in stats_by_pair:
            mean_tone, n_articles = stats_by_pair[pair]
            colors.append(_tone_color(mean_tone))
            sizes.append(int(n_articles / article_scale * max_circle_size))
        else:
            # No article for this couple this month: invisible circle, neutral values.
            mean_tone, n_articles = 0, 0
            colors.append(neutral_color)
            sizes.append(0)
        mean_tone_values.append(mean_tone)
        n_articles_values.append(n_articles)

    df_tones[f"color_{month}"] = colors
    df_tones[f"size_{month}"] = sizes
    df_tones[f"mean_tone_{month}"] = mean_tone_values
    df_tones[f"n_articles_{month}"] = n_articles_values

# Columns read by the tooltips; initialised with the slider's starting month.
df_tones["mean_tone"] = df_tones["mean_tone_1"]
df_tones["n_articles"] = df_tones["n_articles_1"]

source = ColumnDataSource(data=df_tones)
r = p.circle(x="country1", y="country2", color="color_1", size="size_1", source=source)

# On slider move: point the glyph at the columns of the chosen month and copy that
# month's values into the columns shown by the tooltips.
callback = CustomJS(args=dict(r=r, source=source), code=
    """
    var data = source.data;
    var i = cb_obj.value;

    r.glyph.line_color.field = 'color_' + i;
    r.glyph.fill_color.field = 'color_' + i;
    r.glyph.size.field = 'size_' + i;

    data['mean_tone'] = data['mean_tone_' + i];
    data['n_articles'] = data['n_articles_' + i];

    r.glyph.change.emit();
    source.change.emit();
    """)

slider = Slider(start=1, end=6, value=1, step=1, title='Mois')
slider.js_on_change('value', callback)

# Colour bar over the same [min_tone, max_tone] range and palette as the circles.
mapper = linear_cmap(field_name='mean_tone', palette=RdYlGn10_inv, low=float(min_tone), high=float(max_tone))
color_bar = ColorBar(color_mapper=mapper['transform'], width=8, location=(0, 0))

p.add_layout(color_bar, 'right')

l = gridplot([[column(slider)], [p]])

show(l)
