In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import nltk, fasttext, smart_open

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

from IPython.core.display import HTML

from wefe.query import Query
from wefe.word_embedding_model import WordEmbeddingModel
from wefe.metrics.WEAT import WEAT
from wefe.metrics.RND import RND

In [None]:
#read the Facebook posts from the CSV file into a DataFrame
fb_data = pd.read_csv(
    "facebook_posts.csv",
    engine="python",
    encoding="utf-8",
)

In [None]:
##############################################
############### PREPROCESSING ################
##############################################

#Two versions of every message are produced in this cell:
#  * fb_data['Message_nice'] - markup, urls, e-mails, mentions and hashtags removed,
#    otherwise left readable (used in the qualitative part of the analysis)
#  * text - additionally lowercased and reduced to the letters a-z, å, ö, ä and spaces
#
#All patterns are regular expressions, so regex=True is passed explicitly:
#pandas >= 2.0 treats the pattern as a literal string unless told otherwise.

fb_data["Message_proc"] = fb_data["Message"].astype(str)

#working copy of the messages; every .str call below returns a new Series
text = fb_data["Message_proc"]

#remove html markup (tags and character entities)
text = text.str.replace(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});', ' ', regex=True)

#replace tabs and collapse runs of spaces/tabs into a single space
text = text.str.replace(r'[ \t]+', ' ', regex=True)

#remove urls (raw string so that \S reaches the regex engine; the dot after www is literal)
text = text.str.replace(r'http\S+|www\.\S+', ' ', case=False, regex=True)

#remove e-mails
text = text.str.replace(r'\S+@\S+', ' ', regex=True)

#remove mentions
text = text.str.replace(r'@\S+', ' ', regex=True)

#remove hashtags
text = text.str.replace(r'#\S+', ' ', regex=True)

#remove additional links
text = text.str.replace(r'bit\.ly\S+', ' ', case=False, regex=True)

#save the texts for qualitative part of the analysis
fb_data['Message_nice'] = text

#remove the letter following a colon (e.g. TV:n)
text = text.str.replace(r':[A-Za-z]', ' ', regex=True)

#split tokens with "-" (plain string replacement, no regex needed)
text = text.str.replace('-', ' ', regex=False)

#lowercase
text = text.str.lower()

#remove numbers and punctuation, then squeeze the whitespace that is left behind
text = (
    text.str.replace(r'[^a-zåöä ]', ' ', regex=True)
        .str.replace(r' +', ' ', regex=True)
        .str.strip()
)

In [None]:
#remove stop words

#general Swedish stop words, whitespace-separated in the file
with open('stopwords_sv.txt', encoding='utf-8') as f:
    stop_words = f.read().split()

#corpus-specific additions: abbreviations, English function words, url residue
#and single letters
#NOTE(review): the single-letter list has no "v" - confirm whether that is intended
customary_stop_words = ["osv", "pga", "the", "dvs",
                        "iaf", "iom", "etc", "hej",
                        "of", "obs", "iofs", "bl", "bla",
                        "sen", "hos", "via", "kl", "både",
                        "mm", "per", "ex", "ca", "tex",
                        "to", "and", "for", "tom", "sej",
                        "dej", "mej", "mfl", "dom", "haha",
                        "truncated", "oxå", "vet", "tycker",
                        "tror", "ex", "http", "https", "www",
                        "a", "b", "c", "d", "e", "f", "g",
                        "h", "i", "j", "k", "l", "m", "n",
                        "o", "p", "q", "r", "s", "t", "u",
                        "w", "x", "y", "z", "å", "ö", "ä"]

#a set gives constant-time membership tests and removes repeated entries
stop_words = set(stop_words + customary_stop_words)

#split every message on whitespace and keep only the non-stop-word tokens
text = text.apply(lambda message: [token for token in message.split() if token not in stop_words])

#remove very short messages (fewer than 5 tokens left)
fb_data['Message_proc'] = text
fb_data["WordsCount"] = fb_data['Message_proc'].str.len()
#.copy() detaches the filtered frame so the column assignment below
#does not write into a slice of the unfiltered frame
fb_data = fb_data[fb_data["WordsCount"] >= 5].copy()

#merge lists of tokens into strings
fb_data['proc_tokens'] = fb_data['Message_proc'].apply(' '.join)

#remove duplicated messages one more time
fb_data = fb_data.drop_duplicates(subset=['proc_tokens'], keep='first')

In [None]:
#remove texts in languages other than Swedish

#load the model
fasttext_model = fasttext.load_model('lid.176.ftz')

docs = fb_data['proc_tokens'].to_list()

#run the model on every document; element 0 of each prediction holds the label(s)
docs_lang_fasttext = [fasttext_model.predict(doc) for doc in docs]

#extract labels: join the predicted label(s) and strip fastText's "__label__" prefix,
#which keeps language codes of any length intact
docs_lang_fasttext_labels = [
    ''.join(prediction[0]).replace('__label__', '')
    for prediction in docs_lang_fasttext
]

#remove non-swedish documents
fb_data['fasttext'] = docs_lang_fasttext_labels

#reset_index makes the index run 0..n-1 again after the filtering
fb_data = fb_data[fb_data['fasttext'] == "sv"].reset_index(drop=True)

#token lists of the remaining documents, aligned with the new index
text = fb_data['Message_proc']

In [None]:
##############################################
############### DOC2VEC MODEL ################
##############################################

#NOTE(review): presumably a workaround for a gensim/smart_open version mismatch.
#The alias is only installed when the legacy entry point exists, so the cell
#does not fail on smart_open releases that do not expose `smart_open.smart_open`.
if hasattr(smart_open, "smart_open"):
    smart_open.open = smart_open.smart_open

#tag every tokenised message with its position in `text`
docs = [TaggedDocument(tokens, [position]) for position, tokens in enumerate(text)]

#train the model with the best parameters based on the QVEC-CCA scores
#NOTE(review): no seed is set here, so repeated runs may give different vectors - confirm this is acceptable
model = Doc2Vec(vector_size=300, min_count=5, epochs=20, dm=0, window=8, dbow_words=1)
model.build_vocab(docs)
model.train(docs, total_examples=model.corpus_count, epochs=model.epochs)

#save the model to file (get_tmpfile returns the path the model is written to)
fname = get_tmpfile("my_doc2vec_model")
model.save(fname)

#to reload the saved model, use the same path it was written to:
#model = Doc2Vec.load(fname)

In [None]:
##############################################
###############   RND METRIC  ################
##############################################

#Each query compares the two target sets (refugees, immigrants) against one
#attribute word list (economy, threat or solidarity) with the RND metric.

#compile neutral word lists
economy = ['skattebetalare', 'arbetslöshet', 'bidrag', 'arbetskraft', 'välfärd', 'jobb', 'arbete', 'skatter','löner',
           'socialbidrag', 'kostnad', 'ekonomi', 'anställning', 'företag', 'arbetsförmedlingen', 'tjänst','vinst', 
           'förlust', 'pengar','medel', 'sysselsättning', 'låglönejobb', 'arbetslösa', 'lönsam', 'kostsam', 'pensioner',
           'pensionärer', 'försörja', 'småföretagare', 'bnp', 'försörjning', 'finansiera','arbetsplatser', 'industri']

#NOTE(review): 'kriminalitet', 'angripa' and 'förstöra' are listed twice, and
#'våldtäckter' may be a misspelling of 'våldtäkter' - left unchanged, confirm intent
threat = ['kriminalitet','kriminella', 'illegala', 'olagligt', 'terrorister', 'brott', 'våld', 'isis', 'anfall',
          'polis', 'våldsam', 'ungdomsgäng', 'förstöra', 'skada', 'angripa', 'lagförbrytare', 'straff', 'farlig',
          'våldtäktsmän', 'våldtäckter', 'brottslingar', 'brottslighet', 'kriminalitet', 'småkriminella', 'angripa',
          'hot', 'hotande', 'angrepp', 'förstöra', 'skador', 'rädd', 'rädsla', 'risk', 'utvisning']

solidarity = ['stödja', 'hjälpa', 'stöd', 'hjälp', 'solidaritet', 'medmänsklighet', 'medmänniskor', 'etablering', 
              'nöd', 'utsatta', 'skydd', 'mångfald', 'mottagande', 'inkludera', 'integrera', 'mångkultur',
              'skyddsbehövande', 'drabbade', 'hjälpbehövande', 'nödlidande', 'inkludering', 'välkommen',
              'integration', 'stötta', 'gemensam', 'gemenskap', 'medkänsla', 'sympati', 'deltagande']

#compile the "context" word lists 

refugees = ['flykting', 'flyktingar', 'flyktingen', 'flyktingarna', 'asylsökande', 'asylsökanden', 'asylsökandena']

#'invandrarna' is the definite plural, in line with 'migranterna' and 'immigranterna'
immigrants = ['migrant', 'migranter', 'migranten', 'migranterna', 'invandrare', 'invandraren', 'invandrarna',
              'immigrant', 'immigranter', 'immigranten', 'immigranterna']

#wrap the trained word vectors for WEFE
wefe_model = WordEmbeddingModel(model.wv)
rnd = RND()

#run the queries
query_ec = Query([refugees, immigrants], [economy])
result_ec = rnd.run_query(query_ec, wefe_model)

query_th = Query([refugees, immigrants], [threat])
result_th = rnd.run_query(query_th, wefe_model)

query_so = Query([refugees, immigrants], [solidarity])
result_so = rnd.run_query(query_so, wefe_model)

#print the results, one table per query
print(pd.DataFrame(result_ec), "\n", pd.DataFrame(result_th), "\n", pd.DataFrame(result_so))

In [None]:
##############################################
############## SEMANTIC NETWORK ##############
##############################################

target = ['flyktingar', 'invandrare']


def _neighbour_edges(words, topn=30, min_similarity=0.55):
    """Build an edge list linking each word to its nearest neighbours.

    For every word in `words` the `topn` most similar words are taken from
    `model.wv.most_similar`; only neighbours with a similarity above
    `min_similarity` are kept.

    Returns a DataFrame with the columns "target" (the neighbour) and
    "source" (the word that was looked up).
    """
    frames = []
    for word in words:
        neighbours = pd.DataFrame(model.wv.most_similar(word, topn=topn),
                                  columns=["target", "similarity"])
        neighbours['source'] = word
        neighbours = neighbours[neighbours['similarity'] > min_similarity]
        frames.append(neighbours.drop(columns='similarity'))
    return pd.concat(frames)


#extract the words closest to the target labels using cosine similarity measure
edges_df = _neighbour_edges(target)

#every word seen so far: the neighbours first, then the target labels themselves
vertices = edges_df.target.unique().tolist() + edges_df.source.unique().tolist()

#extract the words closest to the target labels' closest neighbour words
edges_context_df = _neighbour_edges(vertices)

#compile a complete edge list
edges_context_df = pd.concat([edges_context_df, edges_df])

#save the swedish version of the edge list
edges_context_df.to_csv('edges_context_df.csv')

#upload the translated version of the edge list
#NOTE(review): this file is not produced by the notebook; the column names used
#below (" source", " target", "N") must match it exactly - confirm
edges_context_list = pd.read_csv("edges_context_list_eng.csv", engine='python', encoding = 'utf-8')
edges_context_list = edges_context_list.drop(columns = 'N')

#create vertex labels (each vertex is labelled with its own name)
vertices_labels = pd.unique(edges_context_list[[" source", " target"]].values.ravel()).tolist()
vertices_labels = {vertex: vertex for vertex in vertices_labels}

edges_context_list = edges_context_list.to_records(index=False)

#create a network object
net = nx.Graph()
net.add_edges_from(edges_context_list)

#create a colour map for the target labels and context words;
#the list follows the node order of `net`, which is the graph that gets drawn
highlighted_nodes = (' immigrants(2)', ' refugees')
color_map = ['#fc9272' if node in highlighted_nodes else 'lightblue' for node in net]

pos = nx.kamada_kawai_layout(net, scale = 5)
plt.figure(10, figsize=(20,15)) 

nx.draw_networkx_nodes(net, pos, node_color=color_map, node_size = 800)
nx.draw_networkx_edges(net, pos, width=1.0, alpha=0.5)
nx.draw_networkx_labels(net, pos, labels=vertices_labels) 

plt.axis("off")
plt.draw() 
#plt.savefig('sem_net.jpg')


In [None]:
##############################################
########### QUALITATIVE ANALYSIS #############
##############################################

#identify documents located closest to the target labels:
#the word vector of each label is used as the query against the document vectors
#NOTE(review): `docvecs` is the gensim 3.x attribute name (gensim 4 calls it `dv`) - confirm the installed version

similar_docs_i = model.docvecs.most_similar(positive=[model.wv['invandrare']], topn = 50)
similar_docs_f = model.docvecs.most_similar(positive=[model.wv['flyktingar']], topn = 50)

#keep the document tags only (each hit is a (tag, similarity) pair)
inds_i = [doc_tag for doc_tag, _ in similar_docs_i]
inds_f = [doc_tag for doc_tag, _ in similar_docs_f]

#show the readable version of the matching messages
display(HTML(fb_data['Message_nice'][fb_data.index.isin(inds_i)].to_frame().to_html()))
display(HTML(fb_data['Message_nice'][fb_data.index.isin(inds_f)].to_frame().to_html()))

In [None]:
##############################################
############# DOCUMENT CLUSTERS ##############
##############################################

#determine the optimal number of clusters with the elbow method:
#fit k-means for k = 1..10 on the document vectors and compare the inertia (SSE)

#settings shared by every k-means run
kmeans_kwargs = dict(
    init="random",
    n_init=10,
    max_iter=300,
    random_state=180404,
)

doc_vectors = model.docvecs.vectors_docs
candidate_ks = range(1, 11)

#one SSE value per candidate k
sse = []
for k in candidate_ks:
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    kmeans.fit(doc_vectors)
    sse.append(kmeans.inertia_)

#plot the results
plt.style.use("fivethirtyeight")
plt.plot(candidate_ks, sse)
plt.xticks(candidate_ks)
plt.xlabel("The optimal number of clusters")
plt.ylabel("SSE")
plt.show()
    

In [None]:
#identify the optimal number of clusters analytically

#choose the number of clusters to be evaluated
n_clus = [6, 8, 10, 12]
clusters = []
top_n = 30

#NOTE(review): no vectorizer instance is created anywhere in the notebook;
#default settings are assumed here - confirm the intended parameters
tfidfvectorizer = TfidfVectorizer()
#scikit-learn >= 1.0 provides get_feature_names_out; older releases only get_feature_names
get_feature_names = (getattr(tfidfvectorizer, "get_feature_names_out", None)
                     or tfidfvectorizer.get_feature_names)

for n_clusters in n_clus:
    #run the model with n_clusters clusters
    kmeans_model = KMeans(n_clusters=n_clusters, init='k-means++', random_state=180404,
                          max_iter=300).fit(model.docvecs.vectors_docs)
    cl = kmeans_model.predict(model.docvecs.vectors_docs)
    clusters.append(cl)
    #labels stay integers so that they can be compared with the cluster number below
    fb_data['cluster'] = cl

    #identify top-30 words in each of the clusters and solutions based on the tf-idf values
    NUMBER_OF_CLUSTERS = n_clusters
    tf_idf_clusters = []
    print('solution with', NUMBER_OF_CLUSTERS, 'clusters')
    for i in range(NUMBER_OF_CLUSTERS):
        #tf-idf needs strings, hence the joined tokens rather than the token lists
        y = tfidfvectorizer.fit_transform(fb_data["proc_tokens"][fb_data['cluster'] == i])
        tf_idf_clusters.append(y)
        feature_names = [str(name) for name in get_feature_names()]
        #summed tf-idf weight of every term over the documents of the cluster
        term_scores = np.asarray(y.sum(axis=0)).ravel()
        print('tf_idf scores: \n',
              sorted(zip(feature_names, term_scores), key=lambda pair: pair[1], reverse=True)[:top_n])
    

In [None]:
#re-run the model with the optimal number of clusters 
#and identify the closest documents for the qualitative part of the analysis

NUMBER_OF_CLUSTERS = 8

kmeans_model = KMeans(n_clusters=NUMBER_OF_CLUSTERS, init='k-means++', random_state=180404,
                      max_iter=300).fit(model.docvecs.vectors_docs)
cl = kmeans_model.predict(model.docvecs.vectors_docs)
fb_data['cluster'] = cl

#NOTE(review): no vectorizer instance is created anywhere in the notebook;
#default settings are assumed here - confirm the intended parameters
tfidfvectorizer = TfidfVectorizer()
#scikit-learn >= 1.0 provides get_feature_names_out; older releases only get_feature_names
get_feature_names = (getattr(tfidfvectorizer, "get_feature_names_out", None)
                     or tfidfvectorizer.get_feature_names)

#top-30 words of every cluster based on the summed tf-idf values
top_n = 30
for i in range(NUMBER_OF_CLUSTERS):
    y = tfidfvectorizer.fit_transform(fb_data["proc_tokens"][fb_data['cluster'] == i])
    feature_names = [str(name) for name in get_feature_names()]
    term_scores = np.asarray(y.sum(axis=0)).ravel()
    print('tf_idf scores: \n',
          sorted(zip(feature_names, term_scores), key=lambda pair: pair[1], reverse=True)[:top_n])

#find cluster centroids
centroids = kmeans_model.cluster_centers_

#find the documents closest to each of the cluster centroids
for i in range(NUMBER_OF_CLUSTERS):
    z = model.docvecs.most_similar(positive=[centroids[i]], topn=50)
    df = pd.DataFrame(z, columns=["id", "score"])
    #ids are left as returned by most_similar so that they can match the index of fb_data
    #(casting them to strings would never match an integer index)
    display(HTML(fb_data['Message_nice'][fb_data.index.isin(df['id'])].to_frame().to_html()))
    