# Die Filterblasen der Anderen?

In [1]:
from datetime import datetime, timedelta
import pytz
import operator
# NOTE(review): star import. `useful` (called in the hashtag cells further down)
# is not imported explicitly anywhere in this notebook, so it presumably comes
# from here -- confirm and prefer an explicit import.
from eigenmodule import *
import pandas as pd
# NOTE(review): `plotly.plotly` was moved to the separate `chart_studio` package
# in plotly 4 -- verify against the installed plotly version.
import plotly.plotly as py
import numpy as np
import plotly.graph_objs as go
import cufflinks as cf
# `config` (not shown here) provides the `username` and `api_key` used below.
import config
import plotly.tools as tls
# Hand the plotly account credentials from `config` to plotly.
tls.set_credentials_file(username=config.username, api_key=config.api_key)
from IPython.display import HTML
# Notebook-wide cufflinks settings: offline=False, world_readable=True and the
# 'ggplot' theme.
cf.set_config_file(offline=False, world_readable=True, theme='ggplot')

### 1. Anwendung

Dies ist eine Anwendung zum Testen der Visualisierungskapazitäten der Python-Bibliothek 'plotly', basierend auf Daten, die mithilfe der Python-Bibliothek 'tweepy' gewonnen wurden.


### 2. Daten

#### 2.1 Beschreibung

Grundlage der Daten sind ca. 4000 Twitter-Accounts, eingeteilt in vier Gruppen: Die erste Gruppe basiert auf einer Liste politisch konservativ bis extrem konservativ eingestellter Accounts, die im Rahmen der 'Reconquista Internet'-Aktion Jan Böhmermanns zusammengetragen wurde. Aufgenommen in die Gruppe wurden die tausend Accounts mit den meisten Followern. Die drei anderen Gruppen basieren jeweils auf tausend Followern der Accounts von 'Spiegel Online', 'Taz - Die Tageszeitung' und der 'Identitären Bewegung Österreich'.

Über die Twitter-API wurden die Tweets dieser vier Gruppen im Zeitraum vom 19. August bis zum 6. September erfasst. Dieser Zeitraum wurde gewählt aufgrund der sich gleichzeitig in Chemnitz entfaltenden Ereignisse.

#### 2.2 Problematik

Vor allem in den Gruppen 2 bis 4 finden sich viele Nutzer, deren Accounts privat geschaltet sind und daher über die Twitter-API nicht verfügbar sind. Weiterhin sind manche Tweets durch Löschung nicht mehr verfügbar. Außerdem wurde nicht überprüft, ob hinter den Accounts Bots stehen oder nicht. Für die Darstellung ist dies allerdings zunächst vernachlässigbar. Weiterhin variiert das Tweet-Verhalten stark zwischen Nutzern, vor allem zwischen 'professionellen' und 'privaten'. Grundsätzlich ist in Frage zu stellen, wie repräsentativ Twitter als Datengrundlage ist.


### 3. Inhaltliches Ziel

Die Anwendung versucht, das Twitter-Verhalten der ersten Gruppe, der 'Böhmermann-Liste', nachzuzeichnen. Zum Vergleich werden die anderen drei Gruppen als Kontrollgruppen herangezogen.


### 4. Methodik

Visualisiert werden die von den jeweiligen Gruppen verwendeten Hashtags im Zeitraum vom 19. August bis zum 6. September. Die Darstellung wurde beschränkt auf die Hashtags, die an mindestens einem der Tage in diesem Zeitraum innerhalb der jeweiligen Gruppe am häufigsten oder am zweithäufigsten verwendet wurden. Deren Häufigkeit wurde auf einem Zeitstrahl als Liniendiagramm dargestellt.

Aufgrund der außergewöhnlich hohen Zahl von Tweets unter dem Hashtag 'Chemnitz' wurden außerdem die Volltexte dieses Subsets von Tweets einbezogen. Diese wurden nach den am häufigsten verwendeten Wörtern und den am häufigsten verwendeten Adjektiv-Subjekt-Kombinationen visualisiert.


### 5. Visualisierung

In [3]:
# Read the dated hashtags of the 'Böhmermann-Liste' group into `hashtags_dict`,
# mapping each date string to the list of all hashtags seen on that date.
# Each line is expected to have the form "<date>|<hashtag list>".
hashtag_dates = []
hashtags_dict = {}

with open("projektdaten/boehmermann_1000_hashtags_dated.txt", "r", encoding="utf-8") as file:
    for line in file:
        # Drop a stray byte-order mark and the trailing newline before splitting.
        fields = line.strip("\ufeff").strip("\n").split("|")
        if len(fields) < 2:
            # Blank or malformed line without a "|" separator: nothing to read.
            continue
        date_key = fields[0]
        # Clean every hashtag on its own. Stripping only the whole field leaves
        # quotes and the blank after each comma inside the tags, so that
        # "'Chemnitz'" and " 'Chemnitz'" would be counted as different hashtags.
        hashtags = [tag.strip(" '\"[]") for tag in fields[1].split(",")]
        # An empty list in the file must not produce an empty-string hashtag.
        hashtags = [tag for tag in hashtags if tag]
        hashtags_dict.setdefault(date_key, []).extend(hashtags)

def removekey(d, key):
    """Return a shallow copy of ``d`` without ``key``; ``d`` itself is left untouched.

    Raises:
        KeyError: if ``key`` is not present in ``d``.
    """
    remaining = dict(d)
    # pop() without a default raises KeyError for a missing key, just like `del`.
    remaining.pop(key)
    return remaining

# Restrict the data to the study period named in the text (19 August to
# 6 September). The window used to be "the last 30 days before today", which
# only matched the data while the notebook was run right after collection and
# relied on `pd.datetime`, which newer pandas versions no longer provide.
# NOTE(review): the year 2018 is assumed from the Chemnitz events -- confirm
# against the date keys in the input file.
STUDY_PERIOD_START = "2018-08-19"
STUDY_PERIOD_END = "2018-09-06"

datelist = pd.date_range(STUDY_PERIOD_START, STUDY_PERIOD_END).tolist()
datestrings = [str(day.date()) for day in datelist]

# Keep only the days of the study period for which hashtags were collected.
hashtags_dict_new = {
    day: hashtags_dict[day] for day in datestrings if day in hashtags_dict
}

# Per day: number of occurrences of every hashtag.
hashtags_dict_counted = {
    day: useful.count_occurrences(tags) for day, tags in hashtags_dict_new.items()
}

key_list = sorted(hashtags_dict_counted)

# Collect the (up to) two most frequent hashtags of every day.
hashtag_list = []
for day in key_list:
    ranked = sorted(
        hashtags_dict_counted[day].items(), key=operator.itemgetter(1), reverse=True
    )
    hashtag_list.extend(item[0] for item in ranked[:2])

hashtag_set = sorted(set(hashtag_list))

# One row per day, one column per selected hashtag; a hashtag that was not used
# on a given day counts as 0. The frame is built in one go instead of adding
# the columns one by one.
hashtag_columns = {"Dates": key_list}
for hashtag in hashtag_set:
    hashtag_columns[hashtag] = [
        hashtags_dict_counted[day].get(hashtag, 0) for day in key_list
    ]
hashtag_df = pd.DataFrame(hashtag_columns)

hashtag_df_index = hashtag_df.set_index(["Dates"]).sort_index()

# Optional export, disabled by default:
#hashtag_df.to_csv(r'boehmermann_hashtag_df.txt', header=True, index=None, sep='|', mode='w', encoding="utf-8")

#### 5.1 Gruppe 1: 'Böhmermann-Liste'

In [4]:
# Scatter plot of `hashtag_df` with the 'Dates' column on the x-axis.
layout = go.Layout(title='Am häufigsten verwendete Hashtags')
hashtag_df.iplot(x='Dates', kind='scatter', layout=layout)

In [5]:
# Bar chart of the phrases that occur more than twice as often as the average
# phrase in the 'Böhmermann-Liste' #Chemnitz tweets.
# Each input line is expected to have the form "<phrase>|<count>".
phrases = []
temp_values = []
values = []

phrase_dict = {}

# "utf-8-sig" reads files with and without a byte-order mark, so a BOM cannot
# end up inside the first label.
with open("bar_charts/boehmermann_chemnitz_tweets_phrase_count.txt", "r", encoding="utf-8-sig") as file:
    for line in file:
        fields = line.strip("\n").split("|")
        if len(fields) < 2:
            # Blank or malformed line without a "|" separator.
            continue
        count = int(fields[1])
        temp_values.append(count)
        phrase_dict[fields[0]] = count

if temp_values:
    # Twice the mean count; computed once instead of once per dictionary entry.
    threshold = (sum(temp_values) / len(temp_values)) * 2
    for key, value in phrase_dict.items():
        if value > threshold:
            phrases.append(key)
            values.append(value)

# NOTE(review): the title says 'Adjektiv-Subjektiv', the Methodik section speaks
# of 'Adjektiv-Subjekt' combinations -- confirm the intended wording.
layout = go.Layout(
    title='Am häufigsten verwendete Adjektiv-Subjektiv Kombinationen in #Chemnitz-Tweets'
)

data = [go.Bar(x=phrases, y=values)]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [6]:
# Bar chart of the words that occur more than twice as often as the average
# word in the 'Böhmermann-Liste' #Chemnitz tweets.
# Each input line is expected to have the form "<word>|<count>".
phrases = []
temp_values = []
values = []

phrase_dict = {}

# "utf-8-sig" reads files with and without a byte-order mark, so a BOM cannot
# end up inside the first label.
with open("bar_charts/boehmermann_chemnitz_tweets_word_count.txt", "r", encoding="utf-8-sig") as file:
    for line in file:
        fields = line.strip("\n").split("|")
        if len(fields) < 2:
            # Blank or malformed line without a "|" separator.
            continue
        count = int(fields[1])
        temp_values.append(count)
        phrase_dict[fields[0]] = count

if temp_values:
    # Twice the mean count; computed once instead of once per dictionary entry.
    threshold = (sum(temp_values) / len(temp_values)) * 2
    for key, value in phrase_dict.items():
        if value > threshold:
            phrases.append(key)
            values.append(value)

layout = go.Layout(
    title='Am häufigsten verwendete Wörter in #Chemnitz-Tweets'
)

data = [go.Bar(x=phrases, y=values)]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [7]:
# Read the dated hashtags of the 'Identitäre Bewegung Österreich' group into
# `hashtags_dict`, mapping each date string to the list of all hashtags seen on
# that date. Each line is expected to have the form "<date>|<hashtag list>".
hashtag_dates = []
hashtags_dict = {}

with open("projektdaten/ib_liste_ueber_1000_hashtags_dated.txt", "r", encoding="utf-8") as file:
    for line in file:
        # Drop a stray byte-order mark and the trailing newline before splitting.
        fields = line.strip("\ufeff").strip("\n").split("|")
        if len(fields) < 2:
            # Blank or malformed line without a "|" separator: nothing to read.
            continue
        date_key = fields[0]
        # Clean every hashtag on its own. Stripping only the whole field leaves
        # quotes and the blank after each comma inside the tags, so that
        # "'Chemnitz'" and " 'Chemnitz'" would be counted as different hashtags.
        hashtags = [tag.strip(" '\"[]") for tag in fields[1].split(",")]
        # An empty list in the file must not produce an empty-string hashtag.
        hashtags = [tag for tag in hashtags if tag]
        hashtags_dict.setdefault(date_key, []).extend(hashtags)

def removekey(d, key):
    """Return a shallow copy of ``d`` without ``key``; ``d`` itself is left untouched.

    Raises:
        KeyError: if ``key`` is not present in ``d``.
    """
    remaining = dict(d)
    # pop() without a default raises KeyError for a missing key, just like `del`.
    remaining.pop(key)
    return remaining

# Restrict the data to the study period named in the text (19 August to
# 6 September). The window used to be "the last 30 days before today", which
# only matched the data while the notebook was run right after collection and
# relied on `pd.datetime`, which newer pandas versions no longer provide.
# NOTE(review): the year 2018 is assumed from the Chemnitz events -- confirm
# against the date keys in the input file.
STUDY_PERIOD_START = "2018-08-19"
STUDY_PERIOD_END = "2018-09-06"

datelist = pd.date_range(STUDY_PERIOD_START, STUDY_PERIOD_END).tolist()
datestrings = [str(day.date()) for day in datelist]

# Keep only the days of the study period for which hashtags were collected.
hashtags_dict_new = {
    day: hashtags_dict[day] for day in datestrings if day in hashtags_dict
}

# Per day: number of occurrences of every hashtag.
hashtags_dict_counted = {
    day: useful.count_occurrences(tags) for day, tags in hashtags_dict_new.items()
}

key_list = sorted(hashtags_dict_counted)

# Collect the (up to) two most frequent hashtags of every day.
hashtag_list = []
for day in key_list:
    ranked = sorted(
        hashtags_dict_counted[day].items(), key=operator.itemgetter(1), reverse=True
    )
    hashtag_list.extend(item[0] for item in ranked[:2])

hashtag_set = sorted(set(hashtag_list))

# One row per day, one column per selected hashtag; a hashtag that was not used
# on a given day counts as 0. The frame is built in one go instead of adding
# the columns one by one.
hashtag_columns = {"Dates": key_list}
for hashtag in hashtag_set:
    hashtag_columns[hashtag] = [
        hashtags_dict_counted[day].get(hashtag, 0) for day in key_list
    ]
hashtag_df = pd.DataFrame(hashtag_columns)

hashtag_df_index = hashtag_df.set_index(["Dates"]).sort_index()

# Optional export, disabled by default:
#hashtag_df.to_csv(r'ib_hashtag_df.txt', header=True, index=None, sep='|', mode='w', encoding="utf-8")

#### 5.2 Gruppe 2: 'Identitäre Bewegung Österreich'

In [8]:
# Scatter plot of `hashtag_df` with the 'Dates' column on the x-axis.
layout = go.Layout(title='Am häufigsten verwendete Hashtags')
hashtag_df.iplot(x='Dates', kind='scatter', layout=layout)

In [9]:
# Bar chart of the phrases that occur more than twice as often as the average
# phrase in the 'Identitäre Bewegung Österreich' #Chemnitz tweets.
# Each input line is expected to have the form "<phrase>|<count>".
phrases = []
temp_values = []
values = []

phrase_dict = {}

# "utf-8-sig" reads files with and without a byte-order mark, so a BOM cannot
# end up inside the first label.
with open("bar_charts/ib_chemnitz_tweets_phrase_count.txt", "r", encoding="utf-8-sig") as file:
    for line in file:
        fields = line.strip("\n").split("|")
        if len(fields) < 2:
            # Blank or malformed line without a "|" separator.
            continue
        count = int(fields[1])
        temp_values.append(count)
        phrase_dict[fields[0]] = count

if temp_values:
    # Twice the mean count; computed once instead of once per dictionary entry.
    threshold = (sum(temp_values) / len(temp_values)) * 2
    for key, value in phrase_dict.items():
        if value > threshold:
            phrases.append(key)
            values.append(value)

# NOTE(review): the title says 'Adjektiv-Subjektiv', the Methodik section speaks
# of 'Adjektiv-Subjekt' combinations -- confirm the intended wording.
layout = go.Layout(
    title='Am häufigsten verwendete Adjektiv-Subjektiv Kombinationen in #Chemnitz-Tweets'
)

data = [go.Bar(x=phrases, y=values)]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [10]:
# Bar chart of the words that occur more than twice as often as the average
# word in the 'Identitäre Bewegung Österreich' #Chemnitz tweets.
# Each input line is expected to have the form "<word>|<count>".
phrases = []
temp_values = []
values = []

phrase_dict = {}

# "utf-8-sig" reads files with and without a byte-order mark, so a BOM cannot
# end up inside the first label.
with open("bar_charts/ib_chemnitz_tweets_word_count.txt", "r", encoding="utf-8-sig") as file:
    for line in file:
        fields = line.strip("\n").split("|")
        if len(fields) < 2:
            # Blank or malformed line without a "|" separator.
            continue
        count = int(fields[1])
        temp_values.append(count)
        phrase_dict[fields[0]] = count

if temp_values:
    # Twice the mean count; computed once instead of once per dictionary entry.
    threshold = (sum(temp_values) / len(temp_values)) * 2
    for key, value in phrase_dict.items():
        if value > threshold:
            phrases.append(key)
            values.append(value)

layout = go.Layout(
    title='Am häufigsten verwendete Wörter in #Chemnitz-Tweets'
)

data = [go.Bar(x=phrases, y=values)]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [11]:
# Read the dated hashtags of the 'Taz - Die Tageszeitung' follower group into
# `hashtags_dict`, mapping each date string to the list of all hashtags seen on
# that date. Each line is expected to have the form "<date>|<hashtag list>".
hashtag_dates = []
hashtags_dict = {}

with open("projektdaten/taz_1000_hashtags_dated.txt", "r", encoding="utf-8") as file:
    for line in file:
        # Drop a stray byte-order mark and the trailing newline before splitting.
        fields = line.strip("\ufeff").strip("\n").split("|")
        if len(fields) < 2:
            # Blank or malformed line without a "|" separator: nothing to read.
            continue
        date_key = fields[0]
        # Clean every hashtag on its own. Stripping only the whole field leaves
        # quotes and the blank after each comma inside the tags, so that
        # "'Chemnitz'" and " 'Chemnitz'" would be counted as different hashtags.
        hashtags = [tag.strip(" '\"[]") for tag in fields[1].split(",")]
        # An empty list in the file must not produce an empty-string hashtag.
        hashtags = [tag for tag in hashtags if tag]
        hashtags_dict.setdefault(date_key, []).extend(hashtags)

def removekey(d, key):
    """Return a shallow copy of ``d`` without ``key``; ``d`` itself is left untouched.

    Raises:
        KeyError: if ``key`` is not present in ``d``.
    """
    remaining = dict(d)
    # pop() without a default raises KeyError for a missing key, just like `del`.
    remaining.pop(key)
    return remaining

# Restrict the data to the study period named in the text (19 August to
# 6 September). The window used to be "the last 30 days before today", which
# only matched the data while the notebook was run right after collection and
# relied on `pd.datetime`, which newer pandas versions no longer provide.
# NOTE(review): the year 2018 is assumed from the Chemnitz events -- confirm
# against the date keys in the input file.
STUDY_PERIOD_START = "2018-08-19"
STUDY_PERIOD_END = "2018-09-06"

datelist = pd.date_range(STUDY_PERIOD_START, STUDY_PERIOD_END).tolist()
datestrings = [str(day.date()) for day in datelist]

# Keep only the days of the study period for which hashtags were collected.
hashtags_dict_new = {
    day: hashtags_dict[day] for day in datestrings if day in hashtags_dict
}

# Per day: number of occurrences of every hashtag.
hashtags_dict_counted = {
    day: useful.count_occurrences(tags) for day, tags in hashtags_dict_new.items()
}

key_list = sorted(hashtags_dict_counted)

# Collect the (up to) two most frequent hashtags of every day.
hashtag_list = []
for day in key_list:
    ranked = sorted(
        hashtags_dict_counted[day].items(), key=operator.itemgetter(1), reverse=True
    )
    hashtag_list.extend(item[0] for item in ranked[:2])

hashtag_set = sorted(set(hashtag_list))

# One row per day, one column per selected hashtag; a hashtag that was not used
# on a given day counts as 0. The frame is built in one go instead of adding
# the columns one by one.
hashtag_columns = {"Dates": key_list}
for hashtag in hashtag_set:
    hashtag_columns[hashtag] = [
        hashtags_dict_counted[day].get(hashtag, 0) for day in key_list
    ]
hashtag_df = pd.DataFrame(hashtag_columns)

hashtag_df_index = hashtag_df.set_index(["Dates"]).sort_index()

# Optional export, disabled by default:
#hashtag_df.to_csv(r'taz_hashtag_df.txt', header=True, index=None, sep='|', mode='w', encoding="utf-8")

#### 5.3 Gruppe 3: 'Taz - Die Tageszeitung'

In [12]:
# Scatter plot of `hashtag_df` with the 'Dates' column on the x-axis.
layout = go.Layout(title='Am häufigsten verwendete Hashtags')
hashtag_df.iplot(x='Dates', kind='scatter', layout=layout)

In [13]:
# Bar chart of the phrases that occur more than twice as often as the average
# phrase in the 'Taz - Die Tageszeitung' follower group's #Chemnitz tweets.
# Each input line is expected to have the form "<phrase>|<count>".
phrases = []
temp_values = []
values = []

phrase_dict = {}

# "utf-8-sig" reads files with and without a byte-order mark, so a BOM cannot
# end up inside the first label.
with open("bar_charts/taz_chemnitz_tweets_phrase_count.txt", "r", encoding="utf-8-sig") as file:
    for line in file:
        fields = line.strip("\n").split("|")
        if len(fields) < 2:
            # Blank or malformed line without a "|" separator.
            continue
        count = int(fields[1])
        temp_values.append(count)
        phrase_dict[fields[0]] = count

if temp_values:
    # Twice the mean count; computed once instead of once per dictionary entry.
    threshold = (sum(temp_values) / len(temp_values)) * 2
    for key, value in phrase_dict.items():
        if value > threshold:
            phrases.append(key)
            values.append(value)

# NOTE(review): the title says 'Adjektiv-Subjektiv', the Methodik section speaks
# of 'Adjektiv-Subjekt' combinations -- confirm the intended wording.
layout = go.Layout(
    title='Am häufigsten verwendete Adjektiv-Subjektiv-Kombinationen in #Chemnitz-Tweets'
)

data = [go.Bar(x=phrases, y=values)]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [14]:
# Bar chart of the words that occur more than twice as often as the average
# word in the 'Taz - Die Tageszeitung' follower group's #Chemnitz tweets.
# Each input line is expected to have the form "<word>|<count>".
phrases = []
temp_values = []
values = []

phrase_dict = {}

# "utf-8-sig" reads files with and without a byte-order mark, so a BOM cannot
# end up inside the first label.
with open("bar_charts/taz_chemnitz_tweets_word_count.txt", "r", encoding="utf-8-sig") as file:
    for line in file:
        fields = line.strip("\n").split("|")
        if len(fields) < 2:
            # Blank or malformed line without a "|" separator.
            continue
        count = int(fields[1])
        temp_values.append(count)
        phrase_dict[fields[0]] = count

if temp_values:
    # Twice the mean count; computed once instead of once per dictionary entry.
    threshold = (sum(temp_values) / len(temp_values)) * 2
    for key, value in phrase_dict.items():
        if value > threshold:
            phrases.append(key)
            values.append(value)

layout = go.Layout(
    title='Am häufigsten verwendete Wörter in #Chemnitz-Tweets'
)

data = [go.Bar(x=phrases, y=values)]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [15]:
# Read the dated hashtags of the 'Spiegel Online' follower group into
# `hashtags_dict`, mapping each date string to the list of all hashtags seen on
# that date. Each line is expected to have the form "<date>|<hashtag list>".
hashtag_dates = []
hashtags_dict = {}

with open("projektdaten/spon_1000_hashtags_dated.txt", "r", encoding="utf-8") as file:
    for line in file:
        # Drop a stray byte-order mark and the trailing newline before splitting.
        fields = line.strip("\ufeff").strip("\n").split("|")
        if len(fields) < 2:
            # Blank or malformed line without a "|" separator: nothing to read.
            continue
        date_key = fields[0]
        # Clean every hashtag on its own. Stripping only the whole field leaves
        # quotes and the blank after each comma inside the tags, so that
        # "'Chemnitz'" and " 'Chemnitz'" would be counted as different hashtags.
        hashtags = [tag.strip(" '\"[]") for tag in fields[1].split(",")]
        # An empty list in the file must not produce an empty-string hashtag.
        hashtags = [tag for tag in hashtags if tag]
        hashtags_dict.setdefault(date_key, []).extend(hashtags)

def removekey(d, key):
    """Return a shallow copy of ``d`` without ``key``; ``d`` itself is left untouched.

    Raises:
        KeyError: if ``key`` is not present in ``d``.
    """
    remaining = dict(d)
    # pop() without a default raises KeyError for a missing key, just like `del`.
    remaining.pop(key)
    return remaining

# Restrict the data to the study period named in the text (19 August to
# 6 September). The window used to be "the last 30 days before today", which
# only matched the data while the notebook was run right after collection and
# relied on `pd.datetime`, which newer pandas versions no longer provide.
# NOTE(review): the year 2018 is assumed from the Chemnitz events -- confirm
# against the date keys in the input file.
STUDY_PERIOD_START = "2018-08-19"
STUDY_PERIOD_END = "2018-09-06"

datelist = pd.date_range(STUDY_PERIOD_START, STUDY_PERIOD_END).tolist()
datestrings = [str(day.date()) for day in datelist]

# Keep only the days of the study period for which hashtags were collected.
hashtags_dict_new = {
    day: hashtags_dict[day] for day in datestrings if day in hashtags_dict
}

# Per day: number of occurrences of every hashtag.
hashtags_dict_counted = {
    day: useful.count_occurrences(tags) for day, tags in hashtags_dict_new.items()
}

key_list = sorted(hashtags_dict_counted)

# Collect the (up to) two most frequent hashtags of every day.
hashtag_list = []
for day in key_list:
    ranked = sorted(
        hashtags_dict_counted[day].items(), key=operator.itemgetter(1), reverse=True
    )
    hashtag_list.extend(item[0] for item in ranked[:2])

hashtag_set = sorted(set(hashtag_list))

# One row per day, one column per selected hashtag; a hashtag that was not used
# on a given day counts as 0. The frame is built in one go instead of adding
# the columns one by one.
hashtag_columns = {"Dates": key_list}
for hashtag in hashtag_set:
    hashtag_columns[hashtag] = [
        hashtags_dict_counted[day].get(hashtag, 0) for day in key_list
    ]
hashtag_df = pd.DataFrame(hashtag_columns)

hashtag_df_index = hashtag_df.set_index(["Dates"]).sort_index()

# Optional export, disabled by default:
#hashtag_df.to_csv(r'spon_hashtag_df.txt', header=True, index=None, sep='|', mode='w', encoding="utf-8")

#### 5.4 Gruppe 4: 'Spiegel Online'

In [16]:
# Scatter plot of `hashtag_df` with the 'Dates' column on the x-axis.
layout = go.Layout(title='Am häufigsten verwendete Hashtags')
hashtag_df.iplot(x='Dates', kind='scatter', layout=layout)

In [17]:
# Bar chart of the phrases that occur more than twice as often as the average
# phrase in the 'Spiegel Online' follower group's #Chemnitz tweets.
# Each input line is expected to have the form "<phrase>|<count>".
phrases = []
temp_values = []
values = []

phrase_dict = {}

# "utf-8-sig" reads files with and without a byte-order mark, so a BOM cannot
# end up inside the first label.
with open("bar_charts/spon_chemnitz_tweets_phrase_count.txt", "r", encoding="utf-8-sig") as file:
    for line in file:
        fields = line.strip("\n").split("|")
        if len(fields) < 2:
            # Blank or malformed line without a "|" separator.
            continue
        count = int(fields[1])
        temp_values.append(count)
        phrase_dict[fields[0]] = count

if temp_values:
    # Twice the mean count; computed once instead of once per dictionary entry.
    threshold = (sum(temp_values) / len(temp_values)) * 2
    for key, value in phrase_dict.items():
        if value > threshold:
            phrases.append(key)
            values.append(value)

# NOTE(review): the title says 'Adjektiv-Subjektiv', the Methodik section speaks
# of 'Adjektiv-Subjekt' combinations -- confirm the intended wording.
layout = go.Layout(
    title='Am häufigsten verwendete Adjektiv-Subjektiv-Kombinationen in #Chemnitz-Tweets'
)

data = [go.Bar(x=phrases, y=values)]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [18]:
# Bar chart of the words that occur more than twice as often as the average
# word in the 'Spiegel Online' follower group's #Chemnitz tweets.
# Each input line is expected to have the form "<word>|<count>".
phrases = []
temp_values = []
values = []

phrase_dict = {}

# "utf-8-sig" reads files with and without a byte-order mark, so a BOM cannot
# end up inside the first label.
with open("bar_charts/spon_chemnitz_tweets_word_count.txt", "r", encoding="utf-8-sig") as file:
    for line in file:
        fields = line.strip("\n").split("|")
        if len(fields) < 2:
            # Blank or malformed line without a "|" separator.
            continue
        count = int(fields[1])
        temp_values.append(count)
        phrase_dict[fields[0]] = count

if temp_values:
    # Twice the mean count; computed once instead of once per dictionary entry.
    threshold = (sum(temp_values) / len(temp_values)) * 2
    for key, value in phrase_dict.items():
        if value > threshold:
            phrases.append(key)
            values.append(value)

layout = go.Layout(
    title='Am häufigsten verwendete Wörter in #Chemnitz-Tweets'
)

data = [go.Bar(x=phrases, y=values)]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

### 6. Fazit

Die Visualisierung funktioniert erwartungsgemäß. Um die Darstellung analysieren zu können, muss die Fragestellung genauer definiert und theoretisch unterfüttert werden. 'Chemnitz' als Hashtag ist allerdings in allen vier Gruppen außerordentlich prominent vertreten.