# Analyzer 

In [1]:
from pymongo import MongoClient
import logger
import datetime
import json


class Community:
    """Load and persist the opinion-leader community stored in the ``Makhela`` MongoDB database.

    State filled in by :meth:`get_community`:

    * ``leaders``   - dict mapping ``int(twitter_id)`` to the leader document; every
      document gets a ``"posts"`` list holding the ``full_text`` of that leader's posts.
    * ``posts``     - dict mapping ``post_id`` to the post document.
    * ``key_words`` - list of the ``"word"`` values found in the ``keywords`` collection.
    """

    def __init__(self, user, password):
        """Store the credentials and immediately try to connect (see :meth:`connect_db`)."""
        self.user = user
        self.password = password
        # Remains None when connect_db() fails; the fetch/save methods need a live handle.
        self.db = None
        self.leaders = {}
        self.posts = {}
        self.key_words = []
        self.connect_db()

    def connect_db(self):
        """Create the MongoDB client and keep the ``Makhela`` database handle in ``self.db``.

        Any failure is logged ("failed to connect") and swallowed, leaving ``self.db`` unset.
        """
        from urllib.parse import quote_plus

        log = logger.logger_handler()
        try:
            log.send_message_to_logfile("connecting")
            # Credentials are percent-escaped so that reserved characters ('@', ':', '/', ...)
            # in a user name or password cannot corrupt the connection URI.
            # NOTE(review): ssl_cert_reqs=CERT_NONE asks for no certificate verification -
            # confirm this is really intended for this cluster.
            client = MongoClient(
                "mongodb+srv://" + quote_plus(self.user) + ":" + quote_plus(self.password)
                + "@makhela-qvsh8.mongodb.net/Makhela?ssl=true&ssl_cert_reqs=CERT_NONE")
            self.db = client.Makhela
            log.send_message_to_logfile("connected")
        except Exception:
            log.send_message_to_logfile("failed to connect")

    def fetch_opinion_leaders(self, log):
        """Read the ``opinion_leaders`` collection into ``self.leaders``.

        Documents whose ``twitter_id`` is missing or not convertible to ``int`` are
        logged and skipped. Each kept document gets an empty ``"posts"`` list.
        """
        log.send_message_to_logfile("fetching opinion leaders")
        leaders = {}
        for leader in self.db["opinion_leaders"].find():
            try:
                twitter_id = int(leader["twitter_id"])
            except (KeyError, TypeError, ValueError):
                log.send_message_to_logfile("failed to fetch: ", leader)
                continue
            leader["posts"] = []
            leaders[twitter_id] = leader
        self.leaders = leaders

    def fetch_posts(self, log):
        """Read the posts of every known leader into ``self.posts`` (keyed by ``post_id``)."""
        log.send_message_to_logfile("fetching posts")
        posts = {}
        for leader_id in self.leaders:
            for post in self.db["posts"].find({"leader_twitter_id": leader_id}):
                try:
                    posts[post["post_id"]] = post
                except (KeyError, TypeError):
                    log.send_message_to_logfile("exception ", leader_id)
        self.posts = posts

    def fetch_key_words(self, log):
        """Read the ``keywords`` collection into ``self.key_words``.

        The list is rebuilt on every call, so calling this twice does not duplicate
        entries. Documents without a ``"word"`` field are logged and skipped.

        Returns:
            The refreshed ``self.key_words`` list, or ``None`` when the query itself
            fails (in which case ``self.key_words`` is left untouched).
        """
        log.send_message_to_logfile("fatching keywords")
        key_words = []
        try:
            for entry in self.db.keywords.find():
                try:
                    key_words.append(entry["word"])
                except (KeyError, TypeError):
                    log.send_message_to_logfile("exception: ", entry)
        except Exception:
            log.send_message_to_logfile("failed to fetch keywords")
            return None
        self.key_words = key_words
        return self.key_words

    def add_posts_to_leaders(self, log):
        """Append every post's ``full_text`` to the ``"posts"`` list of its leader.

        Posts whose leader is unknown, or that lack the needed fields, are logged and skipped.
        """
        log.send_message_to_logfile("adding posts to leaders")
        for post_id, post in self.posts.items():
            try:
                self.leaders[post['leader_twitter_id']]['posts'].append(post['full_text'])
            except (KeyError, TypeError, AttributeError):
                log.send_message_to_logfile("failed adding post:", post_id)

    def get_community(self):
        """Fetch leaders, posts and keywords and link the posts to their leaders.

        Returns:
            The tuple ``(self.leaders, self.posts, self.key_words)``.
        """
        log = logger.logger_handler()
        self.fetch_opinion_leaders(log)
        self.fetch_posts(log)
        self.add_posts_to_leaders(log)
        self.fetch_key_words(log)
        return self.leaders, self.posts, self.key_words

    def save_community(self, leaders, posts):
        """Stamp every leader and post with ``analyzed_date`` and write it back to the database.

        Each document replaces the stored one matched by ``twitter_id`` / ``post_id``.
        A failure on one document is logged and does not stop the remaining writes.
        """
        log = logger.logger_handler()
        log.send_message_to_logfile("saving leaders, posts to db")
        today = datetime.datetime.today()
        for key, leader in leaders.items():
            try:
                leader["analyzed_date"] = today
                self.db["opinion_leaders"].replace_one({'twitter_id': key}, leader)
            except Exception:
                log.send_message_to_logfile("exception saving to db - opinion leaders: ", key)
        for key, post in posts.items():
            try:
                post["analyzed_date"] = today
                self.db["posts"].replace_one({'post_id': key}, post)
            except Exception:
                log.send_message_to_logfile("exception saving to db - posts: ", key)

    def save_topics(self, topics):
        """Insert ``topics`` as a new document in the ``topics`` collection (failures are logged)."""
        log = logger.logger_handler()
        log.send_message_to_logfile("saving topics to db")
        try:
            self.db['topics'].insert_one(topics)
        except Exception:
            log.send_message_to_logfile("exception saving topic ")

In [2]:
# Notebook-level logger handle; it is passed explicitly to the analysis functions in the cells below.
log = logger.logger_handler()

In [3]:
import os
from getpass import getpass

# The database credentials are taken from the environment (falling back to an
# interactive prompt) so that no secret is stored in the notebook itself.
# NOTE(review): the password that used to be hard-coded here should be rotated.
db_user = os.environ.get("MAKHELA_DB_USER") or input("MongoDB user: ")
db_password = os.environ.get("MAKHELA_DB_PASSWORD") or getpass("MongoDB password: ")

# Connect and pull leaders, posts and keywords into plain Python containers.
community = Community(db_user, db_password)
leaders, posts, key_words = community.get_community()

connecting
connected
fetching opinion leaders
fetching posts
adding posts to leaders
fatching keywords


In [10]:
def add_key_words(log, posts, key_words):
    """Flag every post with whether its text contains any of the key words.

    Sets ``posts[key]["key_word"]`` to ``1`` when at least one entry of
    ``key_words`` occurs in the post's ``full_text`` (plain ``in`` containment,
    so the match is case-sensitive for string texts) and to ``0`` otherwise.

    Args:
        log: logger exposing ``send_message_to_logfile``.
        posts: dict of post documents, modified in place.
        key_words: iterable of words to look for.

    Posts that cannot be inspected (no ``full_text``, or a value that does not
    support the containment test) are logged and left without the flag.
    """
    log.send_message_to_logfile("adding key word indicator to post")
    for key, value in posts.items():
        try:
            # any() stops at the first hit, like the explicit break it replaces.
            value["key_word"] = int(any(key_word in value['full_text'] for key_word in key_words))
        except (KeyError, TypeError):
            log.send_message_to_logfile("exception adding key word indicator to post: ", key)

In [11]:
# Add the 0/1 "key_word" flag to every fetched post (modifies `posts` in place).
add_key_words(log, posts, key_words)

adding key word indicator to post


In [21]:
def text(dict):
    """Pick the text that best represents a post.

    The retweeted, replied-to and quoted status texts are checked in that
    order; the first one that is not the placeholder string ``'None'`` is
    returned. When all three are placeholders the post's own ``full_text``
    is returned. A missing field raises ``KeyError``.
    """
    # NOTE: the parameter name shadows the builtin `dict`; it is kept because it is
    # part of the function's signature.
    for field in ('retweeted_status_text', 'in_reply_to_status_text', 'quoted_status_text'):
        candidate = dict[field]
        if candidate != 'None':
            return candidate
    return dict['full_text']

In [22]:
def important_words(log, posts):
    """Tag every post with its highest-scoring TF-IDF term(s) under ``post['words']``.

    The representative text of each post (see ``text``) is vectorised with
    ``TfidfVectorizer``; for each post all terms that share the maximum TF-IDF
    weight are stored, in vocabulary order. A post whose text yields no term
    at all gets an empty list. Posts whose text cannot be determined are
    logged and left without a ``'words'`` entry.

    Args:
        log: logger exposing ``send_message_to_logfile``.
        posts: dict of post documents, modified in place.

    Returns:
        None.

    Raises:
        Whatever ``TfidfVectorizer.fit_transform`` raises for unusable input
        (for example when no document contains a single token).
    """
    log.send_message_to_logfile("Finding important words")
    post_ids = []
    documents = []
    for key in posts:
        try:
            documents.append(text(posts[key]))
        except (KeyError, TypeError):
            log.send_message_to_logfile("exception finding text in post: ", key)
            continue
        post_ids.append(key)
    if not documents:
        # Nothing to vectorise.
        return

    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(documents).tocsr()
    # get_feature_names() is gone from recent scikit-learn; prefer the new accessor
    # and fall back to the old one on installations that predate it.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    feature_names = [str(name) for name in get_names()]

    # Work row by row on the sparse matrix instead of materialising a dense
    # (posts x vocabulary) table.
    for row_idx, key in enumerate(post_ids):
        row = vectors.getrow(row_idx)
        best = row.data.max() if row.nnz else 0
        if best <= 0:
            # No weighted term: report nothing rather than the whole vocabulary.
            top_terms = []
        else:
            top_terms = [feature_names[col] for col in sorted(row.indices[row.data == best])]
        try:
            posts[key]['words'] = top_terms
        except (KeyError, TypeError):
            log.send_message_to_logfile("exception while finding important wordst: ", row_idx, top_terms)

    return


In [25]:
import pandas as pd
import nltk;
nltk.download('stopwords')
import re
import numpy as np
import pandas as pd
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['http', 'https', 'co'])
import networkx as nx
from networkx.algorithms import community as nxc
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/svetagimpelson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
# Attach the top TF-IDF term(s) to every post under the "words" key (modifies `posts` in place).
important_words(log, posts)

Finding important words


In [29]:
# Inspect the result: one line per post with the term(s) stored under "words".
for post in posts.values():
    print(post['words'])

['asset', 'casualty', 'editors', 'fb90ygnqmb', 'layoffs', 'org']
['nature']
['may']
['e9hkvi1mjo', 'epidemiologists', 'nonexperts', 'odd', 'predictions', 'supremely']
['1v2px6vp3z', 'features']
['compassionate', 'irrespective', 'positive']
['kerr']
['lost']
['911', 'ahmaud', 'arbery', 'cellphone', 'georgia', 'moments', 'retraced', 'vupst60uct']
['he']
['house']
['billionaires', 'gates']
['feroza']
['games', 'rjvwqywnsd']
['his']
['hy8c4mil2m', 'st6yrlzexj']
['and']
['dad']
['enct4p9hil', 'mine', 'mothersday', 'singlemoms', 'taught', 'thankful', 'theatlantic']
['graduation', 'rpratqemq1', 'snlathome']
['cartoonist', 'lie', 'martyred', 'masqueraders', 'r69kk8lxus', 'xk4a0uyuzf']
['killers']
['erudite', 'forcefully', 'gilsinan', 'hilarious', 'incredibly', 'kathy', 'ttczitwc0f', 'warm']
['solution']
['worse']
['loyal', 'saijcoawm9', 'traditions']
['feds', 'yrvarboyfv']
['8micwfdqqq']
['never']
['mp4figjnqe']
['herat', 'railway']
['maintenance', 'propping']
['oil']
['gilan', 'harvesting', '

In [35]:
def sent_to_words(sentences):
    """Lazily tokenise each item of ``sentences``.

    Every item is converted with ``str()`` and passed to gensim's
    ``simple_preprocess`` with ``deacc=True``; one token list is yielded per item.
    """
    # NOTE(review): per gensim's docs deacc=True strips accent marks (the tokenizer
    # itself drops punctuation) - confirm against the installed version.
    yield from (
        gensim.utils.simple_preprocess(str(raw), deacc=True)
        for raw in sentences
    )

# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Re-tokenise every document and drop the tokens listed in ``stop_words``.

    Each element of ``texts`` is converted with ``str()`` and tokenised again by
    ``simple_preprocess`` before filtering.

    Returns:
        A list with one filtered token list per input document.
    """
    # Snapshot the module-level stop word list as a set: O(1) membership per token.
    blocked = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in blocked] for doc in texts]

def make_bigrams(texts, bigram_model=None):
    """Apply a phrase (bigram) model to every tokenised document.

    Args:
        texts: iterable of token lists.
        bigram_model: object that maps a token list to its phrased version via
            ``model[tokens]``. When omitted, the module-level ``bigram_mod`` is
            used, which must then exist.

    Returns:
        A list with one transformed document per input document.
    """
    model = bigram_mod if bigram_model is None else bigram_model
    return [model[doc] for doc in texts]

def make_trigrams(texts, trigram_model=None, bigram_model=None):
    """Apply a bigram model and then a trigram model to every tokenised document.

    Args:
        texts: iterable of token lists.
        trigram_model: object applied as ``model[tokens]`` to the bigram output.
            Defaults to the module-level ``trigram_mod``.
        bigram_model: object applied as ``model[tokens]`` first. Defaults to the
            module-level ``bigram_mod``.

    Returns:
        A list with one transformed document per input document.
    """
    tri = trigram_mod if trigram_model is None else trigram_model
    bi = bigram_mod if bigram_model is None else bigram_model
    return [tri[bi[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV'), nlp_model=None):
    """Lemmatise tokenised documents, keeping only tokens with an allowed part of speech.

    Tag names follow https://spacy.io/api/annotation

    Args:
        texts: iterable of token lists; each list is joined with spaces and
            parsed as one document.
        allowed_postags: collection of ``pos_`` values to keep.
        nlp_model: callable turning a string into an iterable of tokens that
            expose ``lemma_`` and ``pos_``. Defaults to the module-level ``nlp``,
            which must then exist.

    Returns:
        A list with one list of lemmas per input document.
    """
    pipeline = nlp if nlp_model is None else nlp_model
    allowed = set(allowed_postags)
    texts_out = []
    for sent in texts:
        doc = pipeline(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed])
    return texts_out

In [50]:
# Sample input for trying out the LDA pipeline: the post texts of one leader, picked by a hard-coded twitter id.
data = leaders[47156708]['posts']

In [52]:
def _load_spacy_pipeline():
    """Return the English spaCy pipeline, loading it only on the first call."""
    pipeline = getattr(_load_spacy_pipeline, "_pipeline", None)
    if pipeline is None:
        try:
            pipeline = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
        except OSError:
            # Older installations may expose the model only through the 'en' shortcut.
            pipeline = spacy.load('en', disable=['parser', 'ner'])
        _load_spacy_pipeline._pipeline = pipeline
    return pipeline


def my_lda(data, num_topics=3):
    """Fit an LDA topic model on a list of texts and return its topics.

    Pipeline: strip ``#`` and apostrophes and collapse whitespace, tokenise,
    remove stop words, merge frequent bigrams, lemmatise (keeping nouns,
    adjectives, verbs and adverbs) and train a gensim ``LdaModel``.

    Args:
        data: list of text strings.
        num_topics: number of topics to fit (3 by default, as before).

    Returns:
        The value of ``lda_model.print_topics()`` - a list of
        ``(topic_id, "weight*word + ...")`` tuples - or an empty list when no
        term survives the preprocessing, so there is nothing to model.
    """
    data = [re.sub('#', '', sent) for sent in data]
    data = [re.sub(r'\s+', ' ', sent) for sent in data]
    data = [re.sub("'", "", sent) for sent in data]
    data_words = list(sent_to_words(data))

    # The phrase model is trained on the raw tokens; a higher threshold gives fewer phrases.
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
    bigram_mod = gensim.models.phrases.Phraser(bigram)

    data_words_nostops = remove_stopwords(data_words)
    # The bigram model and the spaCy pipeline are applied here directly, because they
    # are locals of this function and cannot be seen by the module-level helpers.
    data_words_bigrams = [bigram_mod[doc] for doc in data_words_nostops]

    nlp = _load_spacy_pipeline()
    allowed_postags = {'NOUN', 'ADJ', 'VERB', 'ADV'}
    data_lemmatized = [
        [token.lemma_ for token in nlp(" ".join(doc)) if token.pos_ in allowed_postags]
        for doc in data_words_bigrams
    ]

    id2word = corpora.Dictionary(data_lemmatized)
    if len(id2word) == 0:
        return []

    # Term-document frequency (bag of words) for every document.
    corpus = [id2word.doc2bow(doc) for doc in data_lemmatized]

    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=num_topics,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)
    return lda_model.print_topics()

In [55]:
# Fit a topic model per leader; leaders without any post are left without a "topics" entry.
for leader in leaders.values():
    leader_posts = leader['posts']
    if not leader_posts:
        continue
    leader['topics'] = my_lda(leader_posts)

In [57]:
def build_graph(log, leaders):
    """Build the directed "follows" graph of the community.

    Every key of ``leaders`` becomes a node. For every entry in a leader's
    ``community_following`` list an edge ``leader -> entry['twitter_id']`` is
    added (which also creates the target node if it is not a leader).

    Args:
        log: logger exposing ``send_message_to_logfile``.
        leaders: dict mapping twitter id to leader document.

    Returns:
        The ``networkx.DiGraph``; it may be partial or empty when the input is malformed.
    """
    log.send_message_to_logfile("Building Graph")
    DG = nx.DiGraph()
    try:
        DG.add_nodes_from(leaders)
    except Exception:
        log.send_message_to_logfile("failed adding nodes")
    try:
        leader_items = leaders.items()
    except AttributeError:
        # `leaders` is not a mapping: there is nothing to iterate over.
        log.send_message_to_logfile("failed adding edges ")
        return DG
    for key, value in leader_items:
        try:
            for following in value['community_following']:
                DG.add_edge(key, following['twitter_id'])
        except (KeyError, TypeError):
            # A leader without usable follow data contributes no (further) edges.
            continue
    return DG

In [58]:
# Keep the graph under its own name: the later cells read `DG`, and assigning the
# result to `build_graph` would overwrite the function and break re-running this cell.
DG = build_graph(log, leaders)

Building Graph


In [61]:
# Display the (follower, followed) edges of the graph.
DG.edges()

OutEdgeView([(47156708, 48252327), (47156708, 488772327), (47156708, 822499045763674112), (47156708, 1515956552), (47156708, 1515812534), (47156708, 14926939), (47156708, 49702592), (47156708, 3048160253), (47156708, 2739262472), (47156708, 1114663532), (47156708, 288869649), (47156708, 480678403), (47156708, 57550888), (47156708, 1151625187), (47156708, 61051322), (47156708, 482120066), (47156708, 376435807), (47156708, 47633485), (47156708, 175330919), (47156708, 1829301998), (47156708, 22899683), (47156708, 175082335), (47156708, 27966935), (47156708, 41789292), (47156708, 29016548), (47156708, 47635420), (47156708, 2693777042), (2759399473, 1515812534), (2759399473, 910253568397053952), (2759399473, 2785367359), (2759399473, 64725597), (2759399473, 48252327), (2759399473, 288869649), (2759399473, 3048160253), (2759399473, 61051322), (2759399473, 480678403), (2759399473, 30181014), (2759399473, 2190601093), (2759399473, 482120066), (2759399473, 2311830163), (2759399473, 43380129), (

In [78]:
def centrality(log, DG, leaders):
    """Store degree, betweenness and closeness centrality on every leader.

    The three scores are computed with networkx on ``DG`` and written to the
    leader documents under ``"deg_centrality"``, ``"betweenness_centrality"``
    and ``"closeness_centrality"``.

    A metric that cannot be computed is logged once and skipped; the other
    metrics are still stored. A leader missing from one metric's result is
    logged and still receives the remaining metrics.

    Args:
        log: logger exposing ``send_message_to_logfile``.
        DG: the networkx graph to analyse.
        leaders: dict mapping node id to leader document, modified in place.

    Returns:
        None.
    """
    log.send_message_to_logfile("Analyzing centrality")
    # (field stored on the leader, name used in the "finding" log message, networkx function)
    metrics = (
        ("deg_centrality", "degree_centrality", nx.degree_centrality),
        ("betweenness_centrality", "betweenness_centrality", nx.betweenness_centrality),
        ("closeness_centrality", "closeness_centrality", nx.closeness_centrality),
    )
    scores = {}
    for field, label, compute in metrics:
        try:
            scores[field] = compute(DG)
        except Exception:
            log.send_message_to_logfile("exception finding " + label)

    for key, leader in leaders.items():
        for field, per_node in scores.items():
            try:
                leader[field] = per_node[key]
            except (KeyError, TypeError):
                log.send_message_to_logfile("exception adding " + field + ": ", key)
    return
   

In [79]:
# Write the three centrality scores of graph `DG` onto the leader documents (modifies `leaders` in place).
centrality(log, DG, leaders)

Analyzing centrality


In [80]:
# Inspect the result: the three scores per leader, or the leader id when a score is missing.
separator = "_____________"
for leader_id, leader in leaders.items():
    try:
        for metric in ("deg_centrality", "betweenness_centrality", "closeness_centrality"):
            print(leader[metric])
    except:
        print(leader_id)
    print(separator)

1.1551724137931034
0.034815410721857364
0.7118226600985221
_____________
0.0
0.0
0.0
_____________
1.1379310344827587
0.010685095648662159
0.5979310344827586
_____________
1.1724137931034482
0.03701766258604027
0.6060111835973905
_____________
0.9482758620689655
0.003959548058816747
0.5900635208711433
_____________
0.10344827586206896
6.481721545242417e-05
0.4114204365707055
_____________
1.0862068965517242
0.013308089443164941
0.5154577883472057
_____________
0.7068965517241379
0.011705487630942677
0.5154577883472057
_____________
0.25862068965517243
4.174714942396117e-05
0.49280030314513074
_____________
0.48275862068965514
0.004118697565284608
0.49280030314513074
_____________
1.0689655172413792
0.02173431720164434
0.6316172899465761
_____________
0.5689655172413793
0.0020333870420551456
0.4874437781109445
_____________
0.7758620689655172
0.003842228410898369
0.553639846743295
_____________
0.896551724137931
0.015591394178291288
0.6499250374812593
_____________
0.5344827586206896
0.

In [81]:
def community(log, DG, leaders):
    """Label every leader with the community it belongs to.

    Communities come from the second partition produced by networkx's
    Girvan-Newman generator. Multi-member communities are numbered 1, 2, ...
    in sorted order; every single-member community is labelled ``0``. The
    label is stored under ``leader['community']``.

    Args:
        log: logger exposing ``send_message_to_logfile``.
        DG: the networkx graph to partition.
        leaders: dict mapping node id to leader document, modified in place.

    When the partition cannot be computed (including a graph that does not
    yield a second partition) the failure is logged and no leader is labelled.
    Graph nodes that are not keys of ``leaders`` are logged and skipped.
    """
    # NOTE(review): this function's name is the same as the `community` variable created
    # in the connection cell, so defining it replaces that object in the notebook namespace.
    log.send_message_to_logfile("Analyzing community")
    try:
        communities_generator = nxc.girvan_newman(DG)
        next(communities_generator)  # the first partition is skipped on purpose
        next_level_communities = next(communities_generator)
        community_found = sorted(map(sorted, next_level_communities))
    except Exception:
        log.send_message_to_logfile("failed analyzing community")
        return

    counter = 1
    for members in community_found:
        if len(members) == 1:
            label = 0
        else:
            label = counter
            counter += 1
        for member in members:
            try:
                leaders[member]['community'] = label
            except (KeyError, TypeError):
                log.send_message_to_logfile("failed adding community to ", member)

In [82]:
# Assign a community label to every leader based on graph `DG` (modifies `leaders` in place).
community(log, DG, leaders)

Analyzing community


In [83]:
# Inspect the result: the community label per leader, or the leader id when no label was set.
separator = "_____________"
for leader_id, leader in leaders.items():
    try:
        print(leader["community"])
    except:
        print(leader_id)
    print(separator)

1
_____________
0
_____________
1
_____________
1
_____________
1
_____________
0
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
0
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
0
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________
1
_____________


In [88]:
# Pool the post texts of all leaders by community label.
# Each community gets its OWN list: storing a leader's list directly and then
# extending it would silently append the whole community's posts to that leader.
community_posts = {}
for key, value in leaders.items():
    community_posts.setdefault(value['community'], []).extend(value['posts'])

# Fit one topic model per community; a failing community is logged and skipped.
topics = {}
for key, value in community_posts.items():
    try:
        topics[key] = my_lda(value)
    except Exception:
        log.send_message_to_logfile("exception LDA", key)
        

In [89]:
# Display the per-community topics (community label -> list of (topic id, weighted terms)).
topics

{1: [(0,
   '0.029*"nuclear" + 0.009*"arm" + 0.009*"go" + 0.009*"testing" + 0.008*"trump" + 0.007*"year" + 0.006*"back" + 0.006*"start" + 0.006*"get" + 0.006*"weapon"'),
  (1,
   '0.019*"iranian" + 0.011*"year" + 0.010*"say" + 0.009*"policy" + 0.008*"government" + 0.008*"force" + 0.007*"country" + 0.007*"include" + 0.006*"today" + 0.006*"must"'),
  (2,
   '0.019*"test" + 0.012*"make" + 0.012*"case" + 0.010*"new" + 0.010*"death" + 0.008*"would" + 0.008*"first" + 0.007*"s" + 0.007*"world" + 0.006*"see"')],
 0: [(0,
   '0.028*"would" + 0.026*"even" + 0.025*"deal" + 0.025*"crude" + 0.025*"bad" + 0.025*"need" + 0.024*"ally" + 0.023*"thing" + 0.023*"impression" + 0.023*"awful"'),
  (1,
   '0.021*"see" + 0.021*"would" + 0.021*"new" + 0.021*"nuclear" + 0.021*"start" + 0.019*"call" + 0.019*"extend" + 0.019*"include" + 0.019*"year" + 0.019*"time"'),
  (2,
   '0.010*"force" + 0.010*"nuclear" + 0.010*"use" + 0.008*"decision" + 0.008*"spend" + 0.008*"withdraw" + 0.008*"trump" + 0.008*"arm" + 0.008*

In [90]:
# Fit one topic model over the texts of all posts in the network.
posts_data = []
for post in posts.values():
    posts_data.append(post['full_text'])
try:
    posts_topics = my_lda(posts_data)
except:
    # NOTE(review): on failure `posts_topics` is not (re)assigned, so later cells see
    # either no such name or a stale value from an earlier run.
    log.send_message_to_logfile("exception LDA")
                

In [92]:
# Display the network-wide topics as a list of (topic id, weighted terms).
posts_topics

[(0,
  '0.016*"say" + 0.010*"arm" + 0.010*"nuclear" + 0.009*"case" + 0.009*"go" + 0.009*"death" + 0.009*"make" + 0.008*"testing" + 0.007*"know" + 0.006*"start"'),
 (1,
  '0.014*"year" + 0.013*"iranian" + 0.010*"would" + 0.008*"trump" + 0.008*"s" + 0.007*"today" + 0.006*"time" + 0.006*"may" + 0.006*"new" + 0.006*"state"'),
 (2,
  '0.022*"nuclear" + 0.018*"test" + 0.009*"weapon" + 0.009*"first" + 0.009*"see" + 0.007*"give" + 0.007*"also" + 0.007*"be" + 0.006*"world" + 0.006*"long"')]

In [94]:
# Bundle the topic results into one document; every value is stored as its string form.
today = datetime.datetime.today()
topics_data = {
    'network': str(posts_topics),
    'communities': str(topics),
    'date': str(today),
}

In [95]:
# Display the document that will be stored in the "topics" collection.
topics_data

{'network': '[(0, \'0.016*"say" + 0.010*"arm" + 0.010*"nuclear" + 0.009*"case" + 0.009*"go" + 0.009*"death" + 0.009*"make" + 0.008*"testing" + 0.007*"know" + 0.006*"start"\'), (1, \'0.014*"year" + 0.013*"iranian" + 0.010*"would" + 0.008*"trump" + 0.008*"s" + 0.007*"today" + 0.006*"time" + 0.006*"may" + 0.006*"new" + 0.006*"state"\'), (2, \'0.022*"nuclear" + 0.018*"test" + 0.009*"weapon" + 0.009*"first" + 0.009*"see" + 0.007*"give" + 0.007*"also" + 0.007*"be" + 0.006*"world" + 0.006*"long"\')]',
 'communities': '{1: [(0, \'0.029*"nuclear" + 0.009*"arm" + 0.009*"go" + 0.009*"testing" + 0.008*"trump" + 0.007*"year" + 0.006*"back" + 0.006*"start" + 0.006*"get" + 0.006*"weapon"\'), (1, \'0.019*"iranian" + 0.011*"year" + 0.010*"say" + 0.009*"policy" + 0.008*"government" + 0.008*"force" + 0.007*"country" + 0.007*"include" + 0.006*"today" + 0.006*"must"\'), (2, \'0.019*"test" + 0.012*"make" + 0.012*"case" + 0.010*"new" + 0.010*"death" + 0.008*"would" + 0.008*"first" + 0.007*"s" + 0.007*"world"

In [104]:
import os
from getpass import getpass
from urllib.parse import quote_plus

# Credentials come from the environment (falling back to an interactive prompt) so that
# no secret is stored in the notebook; they are percent-escaped before going into the URI.
# NOTE(review): the password that used to be hard-coded here should be rotated.
db_user = os.environ.get("MAKHELA_DB_USER") or input("MongoDB user: ")
db_password = os.environ.get("MAKHELA_DB_PASSWORD") or getpass("MongoDB password: ")

# The client and the database get separate names instead of rebinding `db` twice.
client = MongoClient(
    "mongodb+srv://" + quote_plus(db_user) + ":" + quote_plus(db_password)
    + "@makhela-qvsh8.mongodb.net/Makhela?ssl=true&ssl_cert_reqs=CERT_NONE")
db = client.Makhela

In [102]:
# Display the enriched leader documents (keyed by twitter id).
leaders

{47156708: {'_id': ObjectId('5ec695fd72262063a43aaaa9'),
  'community_following': [{'twitter_screen_name': 'mdubowitz',
    'twitter_id': 48252327,
    'found_date': datetime.datetime(2020, 5, 22, 16, 20, 50, 824000)},
   {'twitter_screen_name': 'AlirezaNader',
    'twitter_id': 488772327,
    'found_date': datetime.datetime(2020, 5, 22, 16, 20, 50, 824000)},
   {'twitter_screen_name': 'Norman_Roule',
    'twitter_id': 822499045763674112,
    'found_date': datetime.datetime(2020, 5, 22, 16, 20, 50, 824000)},
   {'twitter_screen_name': 'AmnestyIran',
    'twitter_id': 1515956552,
    'found_date': datetime.datetime(2020, 5, 22, 16, 20, 50, 824000)},
   {'twitter_screen_name': 'RZimmt',
    'twitter_id': 1515812534,
    'found_date': datetime.datetime(2020, 5, 22, 16, 20, 50, 824000)},
   {'twitter_screen_name': 'jrezaian',
    'twitter_id': 14926939,
    'found_date': datetime.datetime(2020, 5, 22, 16, 20, 50, 824000)},
   {'twitter_screen_name': 'barbaraslavin1',
    'twitter_id': 4970

In [136]:
# Remove the in-memory "posts" list from every leader before the documents are
# written back (presumably so the raw texts are not stored on the leader documents).
# pop() with a default does nothing for leaders that have no such key.
for value in leaders.values():
    value.pop('posts', None)

In [137]:
# Write the analysed leaders and posts back to MongoDB, stamping each with today's date.
# Only ordinary errors are caught per document, so interrupting the kernel still stops
# the loops; one failing document does not prevent the remaining ones from being saved.
log = logger.logger_handler()
log.send_message_to_logfile("saving leaders, posts to db")
today = datetime.datetime.today()
for key, value in leaders.items():
    try:
        value["analyzed_date"] = today
        db["opinion_leaders"].replace_one({'twitter_id': key}, value)
    except Exception:
        log.send_message_to_logfile("exception saving to db - opinion leaders: ", key)
for key, value in posts.items():
    try:
        value["analyzed_date"] = today
        db["posts"].replace_one({'post_id': key}, value)
    except Exception:
        log.send_message_to_logfile("exception saving to db - posts: ", key)

saving leaders, posts to db


In [140]:
# Store the bundled topic results as a new document in the "topics" collection.
# NOTE(review): PyMongo's insert_one is documented to add an "_id" to the dict it is
# given, so re-running this cell with the same `topics_data` may fail - confirm.
log.send_message_to_logfile("saving topics to db")
db.topics.insert_one(topics_data)

saving topics to db


<pymongo.results.InsertOneResult at 0x1a290ea280>

In [143]:
# Query every document of the "topics" collection to check what was stored.
dd = db.topics.find()

In [144]:
# Print each stored topics document returned by the query above.
for d in dd:
    print(d)

{'_id': ObjectId('5ec832c46c47d2fff4164827'), 'network': '[(0, \'0.010*"attack" + 0.007*"be" + 0.007*"iranian" + 0.007*"never" + 0.007*"work" + 0.006*"government" + 0.006*"look" + 0.006*"leader" + 0.006*"give" + 0.005*"return"\'), (1, \'0.017*"say" + 0.011*"would" + 0.010*"trump" + 0.009*"new" + 0.009*"iranian" + 0.008*"arm" + 0.006*"day" + 0.006*"also" + 0.006*"kill" + 0.005*"get"\'), (2, \'0.013*"nuclear" + 0.009*"tell" + 0.008*"use" + 0.007*"take" + 0.006*"see" + 0.006*"official" + 0.006*"sanction" + 0.005*"right" + 0.005*"year" + 0.005*"say"\')]', 'communities': '{1: [(0, \'0.012*"use" + 0.010*"country" + 0.008*"world" + 0.007*"thank" + 0.007*"می" + 0.007*"help" + 0.007*"time" + 0.006*"statement" + 0.006*"conflict" + 0.006*"support"\'), (1, \'0.016*"say" + 0.016*"nuclear" + 0.014*"iranian" + 0.014*"trump" + 0.011*"attack" + 0.010*"official" + 0.009*"new" + 0.009*"people" + 0.008*"year" + 0.008*"tell"\'), (2, \'0.016*"say" + 0.014*"iranian" + 0.013*"security" + 0.010*"force" + 0.010

In [108]:
import pandas as pd
import nltk;
nltk.download('stopwords')
import re
import numpy as np
import pandas as pd
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['http', 'https', 'co'])
import networkx as nx
from networkx.algorithms import community as nxc
from sklearn.feature_extraction.text import TfidfVectorizer

import logger

class Analyzer:
    """Annotate opinion leaders and their posts with text and graph analytics.

    The instance works in place on the dictionaries it is given:

    * ``leaders`` maps a leader key to that leader's document (a dict). The
      code reads the ``posts`` and ``community_following`` entries and writes
      ``topics``, ``community`` and the three centrality scores.
    * ``posts`` maps a post key to that post's document (a dict). The code
      reads the text fields and writes ``key_word`` and ``words``.
    * ``key_words`` is an iterable of values tested with ``in`` against each
      post's ``full_text``.

    NOTE(review): leader keys and the ``twitter_id`` values found under
    ``community_following`` are used as graph nodes as-is; if their types
    differ (e.g. ``int`` vs ``str``) edges will point at nodes that are not
    leaders - confirm against the code that loads the data.
    """

    # spaCy pipeline shared by all instances; loaded on first use because
    # loading it once per LDA run is expensive.
    _nlp = None

    def __init__(self, leaders, posts, key_words):
        self.leaders = leaders
        self.posts = posts
        self.key_words = key_words
        # Replaced by a networkx DiGraph in build_graph(). (The original
        # `'',` had a trailing comma and silently stored a 1-tuple.)
        self.DG = None
        self.topics_data = {}

    def analyze_community(self):
        """Run the whole analysis pipeline.

        Steps, in order: key-word flags, per-post important words, LDA topics
        per leader, graph construction, centrality, community detection, LDA
        topics per community and LDA topics over all posts.

        Returns:
            tuple: ``(leaders, posts, topics_data)``. ``leaders`` no longer
            carry their ``posts`` entry; ``topics_data`` holds the stringified
            network topics, the stringified per-community topics and the
            stringified current date.
        """
        # Local import: the import list of this notebook cell does not
        # include datetime, so do not depend on another cell having done it.
        import datetime

        log = logger.logger_handler()
        log.send_message_to_logfile("Beginning to analyze community")

        self.add_key_words(log)
        self.important_words(log)

        log.send_message_to_logfile("Analyzing LDA Leaders")
        for key, value in self.leaders.items():
            try:
                if value['posts']:
                    value['topics'] = self.analyze_lda(value['posts'], log)
                else:
                    value['topics'] = []
            except Exception:
                log.send_message_to_logfile("Failed LDA leader", key)

        self.build_graph(log)
        self.centrality(log)
        self.community(log)

        log.send_message_to_logfile("Analyzing LDA - Communities")
        community_posts = {}
        for key, value in self.leaders.items():
            # community() may have failed or skipped this leader; without
            # this guard a missing key would abort the whole analysis.
            if 'community' not in value:
                log.send_message_to_logfile("leader has no community: ", key)
                continue
            # Collect into a fresh list so that no leader's own `posts` list
            # is extended as a side effect.
            bucket = community_posts.setdefault(value['community'], [])
            bucket.extend(value.get('posts') or [])

        topics = {}
        for key, value in community_posts.items():
            try:
                topics[key] = self.analyze_lda(value, log)
            except Exception:
                log.send_message_to_logfile("exception LDA", key)

        log.send_message_to_logfile("Analyzing LDA - Network")
        posts_data = [
            post['full_text']
            for post in self.posts.values()
            if 'full_text' in post
        ]
        # Pre-set so the summary below can be built even if LDA raises.
        posts_topics = None
        try:
            posts_topics = self.analyze_lda(posts_data, log)
        except Exception:
            log.send_message_to_logfile("exception LDA")

        today = datetime.datetime.today()
        self.topics_data['network'] = str(posts_topics)
        self.topics_data['communities'] = str(topics)
        self.topics_data['date'] = str(today)

        # The per-leader post lists were only needed for the analysis.
        for value in self.leaders.values():
            value.pop('posts', None)
        return self.leaders, self.posts, self.topics_data

    def add_key_words(self, log):
        """Set ``key_word`` on every post: 1 if any key word occurs in its
        ``full_text``, otherwise 0. Posts that cannot be checked are logged
        and left without the field.
        """
        log.send_message_to_logfile("adding key word indicator to post")
        for key, value in self.posts.items():
            try:
                found = any(
                    key_word in value['full_text']
                    for key_word in self.key_words
                )
                value["key_word"] = 1 if found else 0
            except Exception:
                log.send_message_to_logfile("exception adding key word indicator to post: ", key)

    def text(self, dict):
        """Return the text to analyse for one post document.

        The first of ``retweeted_status_text``, ``in_reply_to_status_text``
        and ``quoted_status_text`` whose value is not the string ``'None'``
        wins; otherwise ``full_text`` is returned.

        Raises:
            KeyError: if a field that has to be inspected is missing.
        """
        for field in ('retweeted_status_text',
                      'in_reply_to_status_text',
                      'quoted_status_text'):
            if dict[field] != 'None':
                return dict[field]
        return dict['full_text']

    def important_words(self, log):
        """Store on every post, under ``words``, the term(s) with the highest
        TF-IDF weight in that post's text (see ``text``).

        Ties are all kept, ordered by vocabulary index. A post whose text
        yields no vocabulary term gets an empty list (the dense
        implementation used to assign it the entire vocabulary, because
        every zero equalled the row maximum). Posts whose text cannot be
        determined are logged and skipped.
        """
        log.send_message_to_logfile("Finding important words")
        post_ids = []
        texts = []
        for key, value in self.posts.items():
            try:
                body = self.text(value)
            except Exception:
                log.send_message_to_logfile("exception finding text in post: ", key)
                continue
            post_ids.append(key)
            texts.append(body)

        if not texts:
            # Nothing to vectorise.
            return

        vectorizer = TfidfVectorizer()
        try:
            matrix = vectorizer.fit_transform(texts).tocsr()
        except ValueError:
            # Raised by scikit-learn when the vocabulary ends up empty.
            log.send_message_to_logfile("exception building tf-idf vocabulary")
            return

        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement and fall back for older installations.
        names_getter = getattr(vectorizer, 'get_feature_names_out', None)
        if names_getter is None:
            names_getter = vectorizer.get_feature_names
        feature_names = np.asarray(names_getter())

        # Walk the CSR rows directly instead of materialising a dense
        # posts x vocabulary table.
        for row_index, post_id in enumerate(post_ids):
            start = matrix.indptr[row_index]
            end = matrix.indptr[row_index + 1]
            weights = matrix.data[start:end]
            if weights.size == 0:
                self.posts[post_id]['words'] = []
                continue
            columns = matrix.indices[start:end]
            best = np.sort(columns[weights == weights.max()])
            self.posts[post_id]['words'] = feature_names[best].tolist()

        return

    @classmethod
    def _load_nlp(cls):
        """Return the shared spaCy pipeline, loading it on first use."""
        if cls._nlp is None:
            try:
                # python3 -m spacy download en
                cls._nlp = spacy.load('en', disable=['parser', 'ner'])
            except OSError:
                # spaCy 3 dropped the 'en' shortcut; use the package name.
                cls._nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
        return cls._nlp

    def analyze_lda(self, data, log):
        """Fit a 3-topic LDA model on ``data`` and return its topics.

        Pipeline: strip ``#``, collapse whitespace, drop apostrophes,
        tokenise, remove stop words, merge frequent bigrams, lemmatise
        keeping nouns/adjectives/verbs/adverbs, then train gensim's LdaModel.

        Args:
            data: iterable of text strings.
            log: logger handle with ``send_message_to_logfile``.

        Returns:
            The value of ``lda_model.print_topics()``, or ``None`` when any
            step fails (the failure is logged).
        """
        log.send_message_to_logfile("Analyzing LDA")
        allowed_postags = ('NOUN', 'ADJ', 'VERB', 'ADV')
        try:
            cleaned = []
            for sent in data:
                sent = re.sub('#', '', sent)
                sent = re.sub(r'\s+', ' ', sent)
                sent = re.sub("'", "", sent)
                cleaned.append(sent)

            # deacc=True removes punctuations
            data_words = [
                gensim.utils.simple_preprocess(str(sent), deacc=True)
                for sent in cleaned
            ]

            # The bigram model is built and used inside this one scope.
            # Previously the helper closures could not see it and silently
            # relied on same-named module-level globals.
            bigram = gensim.models.Phrases(data_words, min_count=5,
                                           threshold=100)  # higher threshold fewer phrases.
            bigram_mod = gensim.models.phrases.Phraser(bigram)

            # Remove stop words (set lookup instead of scanning the list).
            stop_set = set(stop_words)
            data_words_nostops = [
                [word for word in doc if word not in stop_set]
                for doc in data_words
            ]

            # Form bigrams
            data_words_bigrams = [bigram_mod[doc] for doc in data_words_nostops]

            # Lemmatise, keeping only noun, adj, vb, adv
            nlp = self._load_nlp()
            data_lemmatized = [
                [token.lemma_ for token in nlp(" ".join(doc))
                 if token.pos_ in allowed_postags]
                for doc in data_words_bigrams
            ]

            # Dictionary and bag-of-words corpus
            id2word = corpora.Dictionary(data_lemmatized)
            corpus = [id2word.doc2bow(tokens) for tokens in data_lemmatized]

            # Build LDA model
            lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                        id2word=id2word,
                                                        num_topics=3,
                                                        random_state=100,
                                                        update_every=1,
                                                        chunksize=100,
                                                        passes=10,
                                                        alpha='auto',
                                                        per_word_topics=True)
            return lda_model.print_topics()
        except Exception:
            log.send_message_to_logfile("LDA failed")
            return None

    def build_graph(self, log):
        """Create ``self.DG``: one node per leader key and one directed edge
        from a leader to every ``twitter_id`` listed in that leader's
        ``community_following``. Leaders without that entry add no edges.
        """
        log.send_message_to_logfile("Building Graph")
        self.DG = nx.DiGraph()
        try:
            self.DG.add_nodes_from(self.leaders)
        except Exception:
            log.send_message_to_logfile("failed adding nodes")

        for key, value in self.leaders.items():
            try:
                for following in value.get('community_following') or []:
                    self.DG.add_edge(key, following['twitter_id'])
            except Exception:
                log.send_message_to_logfile("failed adding edges ", key)

    def centrality(self, log):
        """Store degree, betweenness and closeness centrality of ``self.DG``
        on every leader (``deg_centrality``, ``betweenness_centrality``,
        ``closeness_centrality``).

        A measure that cannot be computed is logged and skipped; a leader
        missing from the result is logged individually.
        """
        log.send_message_to_logfile("Analyzing centrality")
        measures = (
            ("deg_centrality", "degree_centrality", nx.degree_centrality),
            ("betweenness_centrality", "betweenness_centrality", nx.betweenness_centrality),
            ("closeness_centrality", "closeness_centrality", nx.closeness_centrality),
        )
        for field, label, compute in measures:
            try:
                scores = compute(self.DG)
            except Exception:
                log.send_message_to_logfile("exception finding " + label)
                continue
            # Every measure is written for every leader. (Betweenness and
            # closeness used to sit outside the loop and reached only the
            # last leader.)
            for key in self.leaders:
                try:
                    self.leaders[key][field] = scores[key]
                except Exception:
                    log.send_message_to_logfile("exception adding " + field + ": ", key)

    def community(self, log):
        """Assign a ``community`` number to leaders using the second split
        produced by ``girvan_newman`` on ``self.DG``.

        Groups of size one all get 0; larger groups are numbered 1, 2, ...
        in sorted order. Any failure of the detection itself is logged and
        leaves the leaders untouched.
        """
        log.send_message_to_logfile("Analyzing community")
        try:
            communities_generator = nxc.girvan_newman(self.DG)
            next(communities_generator)  # first split is not used
            next_level_communities = next(communities_generator)
            community_found = sorted(map(sorted, next_level_communities))

            counter = 1
            for members in community_found:
                if len(members) == 1:
                    try:
                        self.leaders[members[0]]['community'] = 0
                    except Exception:
                        log.send_message_to_logfile("failed adding community to ", members[0])
                    continue
                for member in members:
                    try:
                        self.leaders[member]['community'] = counter
                    except Exception:
                        log.send_message_to_logfile("failed adding community to ", member)
                counter += 1
        except Exception:
            log.send_message_to_logfile("failed analyzing community")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/svetagimpelson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [112]:
# Notebook-level logger handle obtained from the project's `logger` module.
log = logger.logger_handler()

In [113]:
import getpass
import os

# Database credentials must not live in the notebook source. They are taken
# from the MAKHELA_DB_USER / MAKHELA_DB_PASSWORD environment variables and,
# when a variable is unset or empty, asked for interactively.
# NOTE(review): the password that used to be written here was committed in
# clear text and should be rotated.
db_user = os.environ.get("MAKHELA_DB_USER") or input("MongoDB user: ")
db_password = os.environ.get("MAKHELA_DB_PASSWORD") or getpass.getpass("MongoDB password: ")

# Connect and load the data that the Analyzer cell consumes.
community = Community(db_user, db_password)
leaders, posts, key_words = community.get_community()

connecting
connected
fetching opinion leaders
fetching posts
adding posts to leaders
fatching keywords


In [114]:
# Run the full analysis; `leaders` and `posts` are rebound to the values
# returned by analyze_community(), and `topics` receives its topics summary.
analyzer = Analyzer(leaders, posts, key_words)
leaders, posts, topics = analyzer.analyze_community()

In [115]:
# Hand the analysed leaders/posts and the topics summary back to the
# Community object for saving.
community.save_community(leaders, posts)
community.save_topics(topics)

adding key word indicator to post
