In [1]:
import numpy as np
import networkx as nx
import pandas as pd

In [2]:
from twkit.utils import *
from twkit.crawler.fillfollow import add100_id

In [3]:
import twkit

### 1. Add starting point, start crawling

In [None]:
# Runs bin/add_user.py for the starting account of the crawl.
# `<username>` is a placeholder: replace it with a real screen name before running
# (as written, the shell would treat the angle brackets as redirections).
!python3 bin/add_user.py <username>

In [None]:
# Start crawling by running scripts/start-all.sh (path is relative to the notebook's working directory).
!bash scripts/start-all.sh

### 2. Export users from the users collection

In [None]:
# Dump the "id" field of every document in the `users` collection to user.npy.
# (Assumes `db` is a PyMongo-style database handle returned by init_state — confirm.)
db, api = init_state()

# Project only "id" and suppress "_id" so each returned document is {"id": ...}.
user_ids = db.users.find(projection={"_id": 0, "id": 1})
ids_dict = list(user_ids)

# Read the field by name instead of relying on it being the first value in the dict.
ids = [u["id"] for u in ids_dict]

# Derived from the fetched data: Collection.count() is deprecated in PyMongo 3.7
# and removed in 4.0, and a separate count query was an unnecessary round-trip.
num_users = len(ids)

np.save("user.npy", ids)

In [4]:
# Load the user ids saved to user.npy by the export cell.
users = np.load("user.npy")

In [5]:
# Number of ids currently held in `users`.
len(users)

249346

### 3. Add more users to be tracked

In [None]:
# Mark every eligible exported user as followed, submitting ids in batches.
users = np.load("user.npy")
verbose(False)
db, api = init_state(use_cache=False)

# Batch size used for each add100_id call (the original flushed once the list exceeded 99 ids).
BATCH_SIZE = 100


def _follow_batch(batch):
    """Submit one batch of ids through add100_id and record each returned user as followed."""
    for u in add100_id(db, api, batch):
        add_to_followed(db, u['id'], u['screen_name_lower'], u.get('protected', False))


idlist = []
for user in users:
    userid = int(user)  # np.load yields numpy scalars; pass a plain int on
    if not can_follow(db, userid, False): continue
    idlist.append(userid)
    if len(idlist) >= BATCH_SIZE:
        _follow_batch(idlist)
        idlist = []

# Flush the final partial batch exactly once, AFTER the loop. Previously this step
# was indented inside the loop, so every iteration re-submitted the growing batch
# and re-added the users returned by the previous call.
if idlist:
    _follow_batch(idlist)

In [3]:
# Reloads user.npy into `users` (same statement as the earlier load cell).
# NOTE(review): looks like a leftover duplicate — candidate for removal.
users = np.load("user.npy")

In [16]:
# NOTE(review): rebinds `users` from a different file (bin/user.npy) than the other load cells;
# any later cell that reads `users` sees whichever load ran last — confirm which file is intended.
users = np.load("bin/user.npy")

In [17]:
# Number of ids currently held in `users` (here: the array loaded from bin/user.npy).
len(users)

48201

### 4. Export graphs

In [7]:
!python3 twkit/curation/exportfollow.py

Traceback (most recent call last):
  File "/Users/zircon/Documents/GitHub/twAwler/twkit/curation/exportfollow.py", line 17, in <module>
    from twkit.utils import *
ModuleNotFoundError: No module named 'twkit'


In [None]:
!python3 twkit/curation/exportmention.py

In [None]:
!python3 twkit/curation/exportquote.py

In [None]:
!python3 twkit/curation/exportreply.py

### 5. Create Graph

#### Initial Follow Graph

In [6]:
# follow2.txt is parsed as space-separated rows with two columns: "start" and "end".
df1 = pd.read_csv("follow2.txt", sep=' ', names=["start", "end"])
# One [start, end] pair per row, as plain Python lists for networkx.
edge_list_new = df1.to_numpy().tolist()

In [7]:
# Build the follow graph from the [start, end] pairs.
# NOTE(review): nx.Graph is undirected, so the direction of each pair is discarded and
# reciprocal pairs collapse into a single edge — confirm this is intended (nx.DiGraph keeps direction).
G = nx.Graph()
G.add_edges_from(edge_list_new)

In [8]:
# Node count, then edge count, of the full follow graph.
print(G.number_of_nodes(), G.number_of_edges())

2532573 5989937


#### Get induced graph on users

In [9]:
# Restrict G to the nodes listed in `users` and the edges among them.
# NOTE(review): `users` is assigned in more than one cell of this notebook; this cell
# uses whichever assignment ran last — confirm it is the intended id list.
induced_G = G.subgraph(users)

In [10]:
# Edge count first, then node count (same order as before).
print(induced_G.number_of_edges(), induced_G.number_of_nodes())

727138 185533


### 6. Export users whose tweets match keywords

#### Using keywords related to vaccination

In [67]:
# Vaccination-related terms, all lowercase: the checks below lowercase hashtags and
# tweet tokens and then test for exact membership in this list.
keywords = ['vaccine', 'vaccination','vaccinate','vax', 'vaccinated']

def hashtag_keywords_check(keywords, hashtags):
    """Return True if any hashtag, lowercased, is exactly one of `keywords`.

    `keywords` is expected to hold lowercase strings; `hashtags` is an iterable
    of strings. An empty `hashtags` yields False.
    """
    return any(tag.lower() in keywords for tag in hashtags)

In [23]:
import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def word_normalization(word):
    """Normalize one token: drop ASCII punctuation, lowercase, then lemmatize.

    Uses the module-level `lemmatizer` (WordNetLemmatizer) and the characters in
    `string.punctuation`.
    """
    # Translation table that deletes every punctuation character.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = word.translate(strip_punctuation).lower()
    return lemmatizer.lemmatize(cleaned)

def tweets_keywords_check(keywords, tweets):
    """Return True if any of `keywords` equals a normalized token of the text `tweets`.

    The text is tokenized with nltk.word_tokenize and every token is passed
    through word_normalization before the exact-match comparison.
    """
    normalized_tokens = [word_normalization(token) for token in nltk.word_tokenize(tweets)]
    return any(keyword in normalized_tokens for keyword in keywords)

In [28]:
def export_users_using_keywords(keywords):
    """Collect tweets matching `keywords` per user and write them to disk.

    Scans every document in `db.tweets`. A tweet is kept when it has a
    'hashtags' field, was created in 2019 or later, and either one of its
    hashtags or one of its normalized text tokens matches a keyword.

    Writes:
      * user_tweets.json    - {user id: [{"tweet_id", "created_at",
                              "subjectivity", "polarity"}, ...]}
      * keyword_users.npy   - array of the user ids that had a match

    `created_at` is stored as a dict of year/month/day/hour/minute/second,
    because a datetime object is not JSON serializable (json.dump raised
    TypeError on it before).
    """
    db, api = init_state()
    users_dict = {}
    for tw in db.tweets.find():
        created = tw.get('created_at')
        if 'hashtags' not in tw or created is None or created.year < 2019:
            continue
        if not (hashtag_keywords_check(keywords, tw['hashtags'])
                or tweets_keywords_check(keywords, tw['text'])):
            continue

        # JSON-safe representation of the timestamp.
        timestamp = {'year': created.year, 'month': created.month, 'day': created.day,
                     'hour': created.hour, 'minute': created.minute, 'second': created.second}

        # Run the sentiment analysis once and read both scores from it.
        sentiment = TextBlob(tw['text']).sentiment

        desc = {"tweet_id": tw['id'], "created_at": timestamp,
                "subjectivity": sentiment.subjectivity, "polarity": sentiment.polarity}

        users_dict.setdefault(tw['user']['id'], []).append(desc)

    print("Finished checking, start saving")
    with open('user_tweets.json', 'w') as fp:
        json.dump(users_dict, fp)
    users = list(users_dict.keys())
    np.save("keyword_users.npy", np.array(users))

In [29]:
import string
from textblob import TextBlob
import json

In [40]:
# Collect, per user, every tweet that matches `keywords`, then save the result.
db, api = init_state()
tweets = db.tweets.find()
users_dict = {}
for tw in tweets:
    # Only tweets that carry a 'hashtags' field and were posted in 2019 or later
    if 'hashtags' in tw and tw['created_at'].year >= 2019:
        # Match on a hashtag or on a normalized token of the text
        if hashtag_keywords_check(keywords, tw['hashtags']) or tweets_keywords_check(keywords, tw['text']):
            # Turn timestamp (datetime.datetime) into a dictionary so it is JSON serializable
            created = tw['created_at']
            timestamp = {'year':created.year,'month':created.month,'day':created.day,
                         'hour':created.hour,'minute':created.minute,'second':created.second}

            # Score the text once; both values come from the same sentiment result
            # (previously TextBlob parsed the text twice per tweet).
            text = tw['text']
            sentiment = TextBlob(text).sentiment
            subjectivity = sentiment.subjectivity
            polarity = sentiment.polarity

            # Create dictionary for the tweet
            desc = {"tweet_id": tw['id'], "created_at": timestamp,
                    "subjectivity": subjectivity, "polarity": polarity}

            # Append this dictionary to the corresponding user's list
            users_dict.setdefault(tw['user']['id'], []).append(desc)

print("Finished checking, start saving")

# Save the user-tweets dictionary
with open('user_tweets.json', 'w') as fp:
    json.dump(users_dict, fp)

# Save the matching user ids for the induced graph.
# Stored under its own name: assigning to `users` here overwrote the crawled-user
# array that the induced-graph cell reads, so re-running that cell gave a different graph.
keyword_users = list(users_dict.keys())
np.save("keyword_users.npy", np.array(keyword_users))

Finished checking, start saving


In [71]:
kw_user = np.load("keyword_users.npy")

# Select nodes that are in the induced graph
current_users = list(induced_G.nodes())
# Hash-based membership: testing each keyword user against the list scanned it
# end-to-end every time (quadratic in the number of users).
current_user_set = set(current_users)
induced_users = [i for i in kw_user if i in current_user_set]

In [48]:
# Subgraph of the induced graph restricted to users with a keyword match.
latest_G = induced_G.subgraph(induced_users)
print("Number of nodes left: ", latest_G.number_of_nodes())
print("Number of edges left: ", latest_G.number_of_edges())

Number of nodes left:  4547
Number of edges left:  17534


In [49]:
# nx.write_gpickle was deprecated in NetworkX 2.6 and removed in 3.0. It pickled the
# graph with the highest protocol, so write the same file with the standard library.
import pickle

with open("latest.gpickle", "wb") as fp:
    pickle.dump(latest_G, fp, pickle.HIGHEST_PROTOCOL)

In [None]:
# Functions for extracting tweets of users (Useless right now)
def export_users_tweets(keywords):
    """Append the text of every keyword-matching tweet to its author's list in `user_tweets`.

    Only tweets with a 'hashtags' field whose author is a key of the module-level
    `user_tweets` dict are considered. The filled dict is written to user_tweets.json.

    NOTE(review): user_tweets.json is also the output file of the keyword-export
    cell, which writes a different structure — running this overwrites it.
    """
    db, api = init_state()

    for tw in db.tweets.find():
        if 'hashtags' not in tw:
            continue
        author = tw['user']['id']
        # Dict lookup instead of scanning the `induced_users` list for every tweet.
        if author not in user_tweets:
            continue
        if hashtag_keywords_check(keywords, tw['hashtags']) or tweets_keywords_check(keywords, tw['text']):
            user_tweets[author].append(tw['text'])

    with open('user_tweets.json', 'w') as fp:
        json.dump(user_tweets, fp)

# One empty list per induced user. This must exist BEFORE the call below (it used to
# be created after it, so the call raised NameError). Keys are converted to plain int
# because json.dump rejects numpy integer keys.
user_tweets = {int(user): [] for user in induced_users}

export_users_tweets(keywords)

### 7. Find the second keyword list

In [None]:
# Count how often each hashtag is used by users of the induced graph.
db, api = init_state()
tweets = db.tweets.find()

# Hash-based membership: `current_users` is a list, and scanning it once per tweet
# made this loop quadratic.
current_user_set = set(current_users)

hashtags_count = {}

for tw in tweets:
    if 'hashtags' in tw and tw['user']['id'] in current_user_set:
        for hashtag in tw['hashtags']:
            # Counting is case-sensitive, so e.g. 'NYC' and 'nyc' are tallied separately.
            hashtags_count[hashtag] = hashtags_count.get(hashtag, 0) + 1

In [54]:
# (hashtag, count) pairs in ascending order of count.
hashtags_sorted = sorted(hashtags_count.items(), key=lambda pair: pair[1])

In [58]:
# Same pairs, most frequent first.
reversed_hashtags_sorted = list(reversed(hashtags_sorted))

In [60]:
# Show the 30 most frequent hashtags.
reversed_hashtags_sorted[:30]

[('モンスト', 76),
 ('NYC', 35),
 ('Biden', 34),
 ('Ukraine', 31),
 ('Russia', 29),
 ('svpol', 22),
 ('GH', 21),
 ('WhiteCoatSummit', 20),
 ('politik', 17),
 ('nyc', 17),
 ('モンストプリズン', 16),
 ('6周年カウントダウン', 16),
 ('Putin', 16),
 ('NewYorkCity', 13),
 ('COVID19', 13),
 ('モンストドラえもん', 12),
 ('オーブ毎週50個以上配布', 12),
 ('Trump', 12),
 ('Afghanistan', 12),
 ('HunterBiden', 9),
 ('JudgeKetanjiBrownJackson', 9),
 ('Taliban', 9),
 ('BlackLivesMatter', 9),
 ('US', 9),
 ('IMPEACHBIDENNOW', 8),
 ('biden', 8),
 ('Florida', 8),
 ('Democrats', 8),
 ('JoeBiden', 8),
 ('WillSmith', 8)]

In [68]:
# Second topic: lowercase terms about the war in Ukraine.
keywords_2 = ['ukraine', 'russia','ukrainewar','ukrainerussianwar','war','russiainvadedukraine','invasion']

db, api = init_state()
tweets = db.tweets.find()
users_dict_2 = {}
for tw in tweets:
    # Only tweets that carry a 'hashtags' field and were posted in 2021 or later
    if 'hashtags' in tw and tw['created_at'].year >= 2021:
        # Match on a hashtag or on a normalized token of the text
        if hashtag_keywords_check(keywords_2, tw['hashtags']) or tweets_keywords_check(keywords_2, tw['text']):
            # Turn timestamp (datetime.datetime) into a dictionary so it is JSON serializable
            created = tw['created_at']
            timestamp = {'year':created.year,'month':created.month,'day':created.day,
                         'hour':created.hour,'minute':created.minute,'second':created.second}

            # Score the text once; both values come from the same sentiment result
            # (previously TextBlob parsed the text twice per tweet).
            text = tw['text']
            sentiment = TextBlob(text).sentiment
            subjectivity = sentiment.subjectivity
            polarity = sentiment.polarity

            # Create dictionary for the tweet
            desc = {"tweet_id": tw['id'], "created_at": timestamp,
                    "subjectivity": subjectivity, "polarity": polarity}

            # Append this dictionary to the corresponding user's list
            users_dict_2.setdefault(tw['user']['id'], []).append(desc)

print("Finished checking, start saving")

# Save the user-tweets dictionary
with open('user_tweets_2.json', 'w') as fp:
    json.dump(users_dict_2, fp)

# Save the users for induced graph
users_2 = list(users_dict_2.keys())
np.save("keyword_users_2.npy", np.array(users_2))

Finished checking, start saving


In [69]:
# Number of users with at least one tweet matching keywords_2.
len(users_2)

6587

In [72]:
kw_user_2 = np.load("keyword_users_2.npy")
# Keep only second-topic users that are nodes of the induced graph. A set gives
# hash-based membership; testing against the `current_users` list was quadratic.
current_user_set = set(current_users)
induced_users_2 = [i for i in kw_user_2 if i in current_user_set]

#### Get final graph

In [73]:
# Users matching BOTH keyword lists, in the order they appear in `induced_users`.
# Membership is tested against a set; scanning the `induced_users_2` list for
# every element was quadratic.
second_topic_users = set(induced_users_2)
final_users = [i for i in induced_users if i in second_topic_users]

In [75]:
# Restrict the first-topic graph to the users who also matched the second keyword list.
final_graph = latest_G.subgraph(final_users)

In [76]:
# Size of the final graph.
print("Number of nodes finally:", final_graph.number_of_nodes())
print("Number of edges finally:", final_graph.number_of_edges())

Number of nodes finally: 3453
Number of edges finally: 11508


In [77]:
# nx.write_gpickle was deprecated in NetworkX 2.6 and removed in 3.0. It pickled the
# graph with the highest protocol, so write the same file with the standard library.
import pickle

with open("final.gpickle", "wb") as fp:
    pickle.dump(final_graph, fp, pickle.HIGHEST_PROTOCOL)