In [1]:
import pandas as pd
import numpy as np
import csv

from glob import glob
import os
import swifter
import shutil

import tweepy
from ttp import ttp

import time
import datetime
from calendar import timegm

from utils.casIn.user_influence import P,influence
from utils.common_utils import get_root_dir, merge_csvs
from utils.twitter_authentication import *
from utils.profilescraper import profileScraper
from process import create_cascades, processor, process_scraped_profile, get_sentiment

In [2]:
def get_influence(df):
    """Attach an influence score to every row of one cascade.

    NOTE: another definition of ``get_influence`` appears later in this same
    cell; being later, that one is the function bound at runtime.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single cascade. Must contain ``ID`` and ``cascade`` plus
        whatever columns ``P`` reads (not visible here -- TODO confirm).

    Returns
    -------
    pandas.DataFrame
        Columns ``ID``, ``inf`` and ``cascade`` on a fresh 0..n-1 index.
    """
    frame = df.reset_index(drop=True)

    p_values = P(frame, r=-0.000068)
    # ``influence`` returns a pair; only the first element is kept.
    scores, _unused = influence(p_values)

    frame['inf'] = scores
    return frame[['ID', 'inf', 'cascade']]

def get_file_name():
    """Return the path of the temporary rescrape CSV, creating it if missing.

    The file lives at ``<root>/data/temp/rescraped.csv`` where ``<root>`` is
    whatever ``get_root_dir()`` returns. When the file does not exist yet it
    is created (together with its parent directory) and the header row is
    written; an existing file is left untouched.

    Returns
    -------
    str
        Path of the CSV file.
    """
    fname = get_root_dir() + '/data/temp/rescraped.csv'

    if not os.path.isfile(fname):
        header = ['timestamp','id','text','likes','retweets','username','user_id','user_created_at','in_response_to',
                  'in_response_to_user_id', 'response_type', 'has_geolocation', 'is_verified', 'total_tweets', 'total_followers',
                  'total_following', 'total_likes', 'total_lists', 'has_background', 'is_protected', 'default_profile']

        # open() does not create missing directories.
        os.makedirs(os.path.dirname(fname), exist_ok=True)

        # newline='' is required by the csv module so that it controls line
        # endings itself (otherwise blank rows appear on Windows); the explicit
        # encoding keeps the file readable by pandas' default utf-8 reader.
        with open(fname, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            writer.writerow(header)

    return fname

def write_csv(row_data):
    """Append one row to the temporary rescrape CSV.

    Parameters
    ----------
    row_data : sequence
        Values for one CSV row, in the column order of the header written by
        ``get_file_name``.
    """
    filename = get_file_name()

    # newline='' lets the csv module manage line endings (and embedded
    # newlines in tweet text); utf-8 is set explicitly so non-ASCII tweet text
    # does not depend on the platform's default encoding.
    with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(row_data)

def make_tweets(find_id, original):
    """Build a placeholder row for a tweet that is missing from ``original``.

    The placeholder is derived from the rows of ``original`` whose
    ``in_response_to_id`` equals ``find_id``. The first such row supplies the
    text and the ID; the user is the first @-mention that ``ttp`` parses out
    of that text (empty string when there is none).

    The timestamp is placed before the first response by the gap between the
    first two responses. When only one response exists no gap can be
    measured, so the placeholder gets the same time as that response.

    Parameters
    ----------
    find_id
        ID of the missing tweet.
    original : pandas.DataFrame
        Must contain ``in_response_to_id``, ``Tweet`` and ``Time``. Rows are
        taken in their existing order -- presumably sorted by ``Time``;
        verify against the caller.

    Returns
    -------
    pandas.Series
        Keys ``ID``, ``Tweet``, ``Time``, ``User``, ``Likes``, ``Retweets``,
        ``in_response_to_id`` and ``response_type``.

    Raises
    ------
    IndexError
        If no row of ``original`` responds to ``find_id``.
    """
    found = original[original['in_response_to_id'] == find_id]
    if len(found) == 0:
        raise IndexError("no row in `original` responds to tweet id {}".format(find_id))

    first = found.iloc[0]
    if len(found) > 1:
        gap = found.iloc[1].Time - first.Time
    else:
        # Zero of whatever type ``Time`` has (int seconds or timestamp).
        gap = first.Time - first.Time

    parsed = ttp.Parser().parse(first['Tweet'])
    mentioned = parsed.users

    current_tweet = {}
    current_tweet['ID'] = first.in_response_to_id
    current_tweet['Tweet'] = first.Tweet
    current_tweet['Time'] = first.Time - gap
    current_tweet['User'] = mentioned[0] if mentioned else ''
    current_tweet['Likes'] = 0
    current_tweet['Retweets'] = 0
    current_tweet['in_response_to_id'] = 0
    current_tweet['response_type'] = 'tweet'

    return pd.Series(current_tweet)

def _status_text(status):
    """Return the text of a tweepy status object, or None when it has none.

    Tries ``extended_tweet['full_text']``, then ``full_text``, then ``text``.
    """
    try:
        return status.extended_tweet['full_text']
    except (AttributeError, KeyError, TypeError):
        pass

    for attr in ('full_text', 'text'):
        value = getattr(status, attr, None)
        if value is not None:
            return value
    return None


def _classify_response(tweet):
    """Return ``(response_type, in_response_to, in_response_to_user_id)`` for a status."""
    in_response_to = getattr(tweet, 'in_reply_to_status_id', None)
    # Always bound, so a value from a previous tweet can never leak through.
    in_response_to_user_id = getattr(tweet, 'in_reply_to_user_id_str', None)

    if in_response_to is not None:
        return 'reply', in_response_to, in_response_to_user_id

    if hasattr(tweet, 'retweeted_status'):
        source = tweet.retweeted_status
        return 'retweet', source.id, source.user._json['id_str'] #probably not required

    if hasattr(tweet, 'quoted_status'):
        source = tweet.quoted_status
        return 'quoted_retweet', source.id, source.user._json['id_str'] #probably not required

    return 'tweet', '0', in_response_to_user_id


def _build_tweet_text(tweet):
    """Return the tweet text with tagged retweeted/quoted text and expanded URLs."""
    text = _status_text(tweet) or ''

    retweeted = getattr(tweet, 'retweeted_status', None)
    if retweeted is not None:
        inner = _status_text(retweeted)
        if inner is not None:
            text = text + ' <retweeted_status> ' + inner + ' </retweeted_status>'

    quoted = getattr(tweet, 'quoted_status', None)
    if quoted is not None:
        inner = _status_text(quoted)
        if inner is not None:
            text = text + ' <quoted_status> ' + inner + ' </quoted_status>'

    if 'urls' in tweet.entities:
        for url in tweet.entities['urls']:
            try:
                text = text.replace(url['url'], url['expanded_url'])
            except (KeyError, TypeError):
                # Entry without a usable short/expanded pair: leave text as is.
                continue

    return text


def rescrape_and_add(original, to_scrape):
    """Fetch missing tweets, merge them into ``original`` and refresh profiles.

    Steps, in order:

    1. Look up the IDs in ``to_scrape['index']`` through the Twitter API, 100
       per request, and append one CSV row per returned status via
       ``write_csv``.
    2. Read the rescrape CSV and the cleaned profile CSV.
    3. Convert ``Time`` to integer epoch seconds on a *copy* of ``original``;
       the caller's frame is not modified.
    4. For requested IDs that were not returned, build placeholder rows with
       ``make_tweets`` and scrape the profiles of their users that are not in
       the profile CSV yet.
    5. Write the merged, de-duplicated profile table back to
       ``data/cleaned_profile.csv``.
    6. Run ``get_sentiment`` on the new rows and return them concatenated with
       the converted ``original``, sorted by ``Time``.

    Parameters
    ----------
    original : pandas.DataFrame
        Existing tweets; must contain ``Time`` and the columns read by
        ``make_tweets``.
    to_scrape : pandas.DataFrame
        Frame whose ``index`` column holds the tweet IDs to fetch.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The combined tweet frame and the updated profile frame.

    Notes
    -----
    The rescrape CSV is appended to and never cleared here, so rows from
    earlier runs are read back as well. NOTE(review): confirm it is cleaned
    elsewhere. The behaviour of ``processor`` on a header-only CSV, and of the
    later steps when every requested ID was returned, is not visible here --
    TODO confirm.
    """
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)

    print("Rescraping {} tweets".format(len(to_scrape)))
    tweet_ids = list(to_scrape['index'].values)
    # The status lookup endpoint accepts at most 100 IDs per request.
    for start in range(0, len(tweet_ids), 100):
        stop = start + 100
        print("{} {}".format(start, stop))

        tweets = api.statuses_lookup(tweet_ids[start:stop], tweet_mode='extended')

        for tweet in tweets:
            response_type, in_response_to, in_response_to_user_id = _classify_response(tweet)
            tweetText = _build_tweet_text(tweet)

            write_csv([tweet.created_at, tweet.id, tweetText, tweet.favorite_count, tweet.retweet_count,
            tweet.user.screen_name, tweet.user._json['id_str'], tweet.user._json['created_at'],in_response_to,
            in_response_to_user_id ,response_type, tweet.user.geo_enabled, tweet.user.verified, tweet.user.statuses_count,
            tweet.user.followers_count, tweet.user.friends_count, tweet.user.favourites_count, tweet.user.listed_count
            ,tweet.user.profile_use_background_image, tweet.user.protected, tweet.user.default_profile])

    # get_file_name() returns the same path and creates a header-only file when
    # nothing was written, so the read cannot fail on a missing file.
    rescraped = pd.read_csv(get_file_name())
    profile = pd.read_csv(os.path.join(get_root_dir(), 'data/cleaned_profile.csv'))

    # Work on a copy: converting Time in place would make a second call parse
    # the already-converted integers as nanoseconds.
    original = original.copy()
    original['Time'] = pd.to_datetime(original['Time'])
    # Explicit 64-bit: plain ``int`` is 32-bit on some platforms and overflows.
    original['Time'] = original['Time'].astype('int64') // 10**9

    rescraped_df, rescraped_profile = processor(rescraped)
    non_existing = to_scrape[~to_scrape['index'].isin(rescraped['id'])]

    virtual_tweets = non_existing['index'].apply(make_tweets, original=original)
    # Concatenate before lower-casing so the tweet rows keep the original case.
    rescraped = pd.concat([virtual_tweets, rescraped_df]).reset_index(drop=True)
    virtual_tweets['User'] = virtual_tweets['User'].str.lower()
    rescrape = virtual_tweets[~virtual_tweets['User'].isin(profile['username'])]
    ps = profileScraper()
    scraped = ps.query_profile(rescrape['User'].values)
    scraped_profile = process_scraped_profile(scraped)

    new_profile = pd.concat([scraped_profile, rescraped_profile, profile]) #clean them seperately before concating
    new_profile = new_profile.drop_duplicates(subset=['username']).reset_index(drop=True)
    new_profile.to_csv(os.path.join(get_root_dir(), 'data/cleaned_profile.csv'), index=None)

    rescraped = get_sentiment(rescraped)
    new_df = pd.concat([original, rescraped])
    new_df = new_df.sort_values('Time')

    return new_df, new_profile

def get_influence(df, r=-0.000068):
    """Compute an influence score for every row of one cascade.

    This redefinition replaces the ``get_influence`` declared earlier in the
    same cell.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single cascade. Must contain ``ID`` and ``cascade`` plus
        whatever columns ``P`` reads (not visible here -- TODO confirm).
    r : float, optional
        Forwarded to ``P`` as its ``r`` keyword. Defaults to the value that
        used to be hard-coded, so existing one-argument calls (including
        ``groupby(...).apply(get_influence)``) behave as before.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID``, ``inf`` and ``cascade`` on a fresh 0..n-1 index.
    """
    df = df.reset_index(drop=True)

    p_ij = P(df, r=r)
    # ``influence`` returns a pair; the second element is not needed here.
    inf, _ = influence(p_ij)
    df['inf'] = inf
    return df[['ID', 'inf', 'cascade']]

def get_influence_metrics(df):
    """Summarise the influence of one group of tweets.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of one group; must contain the ``inf`` column.

    Returns
    -------
    pandas.Series
        ``total_tweets`` (row count), ``total_influence`` (sum of ``inf``) and
        ``avg_influence`` (influence per tweet, i.e.
        ``total_influence / total_tweets`` -- the same ratio ``add_inf`` and
        ``sub_inf`` use). An empty group yields ``avg_influence`` 0.0.
    """
    total_tweets = len(df)
    total_influence = df['inf'].sum()

    curr = {}
    curr['total_tweets'] = total_tweets
    curr['total_influence'] = total_influence
    # Influence per tweet; guarded so an empty frame does not divide by zero.
    curr['avg_influence'] = total_influence / total_tweets if total_tweets else 0.0

    return pd.Series(curr)

def add_influence_and_all(df):
    """Score every tweet per cascade, then aggregate the scores per user.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``ID``, ``cascade`` and ``user_id`` plus whatever
        ``get_influence`` needs.

    Returns
    -------
    pandas.DataFrame
        One row per user with the columns produced by
        ``get_influence_metrics``; the grouping key is exposed as ``username``.
    """
    # Per-cascade scoring gives a frame of (ID, inf, cascade).
    per_tweet = df.groupby('cascade').apply(get_influence).drop_duplicates()

    # Both sides carry ``cascade``, so the merge suffixes them; keep the left one.
    scored = df.merge(per_tweet, on='ID')
    scored = scored.drop('cascade_y', axis=1)
    scored = scored.rename(columns={'cascade_x':'cascade'})

    per_user = scored.groupby('user_id').apply(get_influence_metrics)
    return per_user.reset_index().rename(columns={'user_id': 'username'})

def add_inf(curr_inf, new_inf):
    """Add two per-user influence tables together.

    Parameters
    ----------
    curr_inf, new_inf : pandas.DataFrame
        Each must contain ``username``, ``total_tweets`` and
        ``total_influence``. A user missing from one table counts as 0 there.

    Returns
    -------
    pandas.DataFrame
        Columns ``username``, ``total_tweets``, ``total_influence`` and
        ``avg_influence`` (``total_influence / total_tweets``), sorted by
        ``total_influence`` in descending order. Row labels are those of the
        outer merge; only their order changes.
    """
    merged = new_inf.merge(curr_inf, how='outer', on='username').fillna(0)

    # Build a fresh frame instead of assigning into a column slice, which can
    # trigger pandas' SettingWithCopyWarning.
    combined = pd.DataFrame({
        'username': merged['username'],
        'total_tweets': merged['total_tweets_x'] + merged['total_tweets_y'],
        'total_influence': merged['total_influence_x'] + merged['total_influence_y'],
    })
    combined['avg_influence'] = combined['total_influence']/combined['total_tweets']

    # sort_values returns a new frame; its result must be the one returned.
    return combined.sort_values('total_influence', ascending=False)

def sub_inf(combined_inf, to_remove):
    """Subtract one per-user influence table from another.

    Parameters
    ----------
    combined_inf, to_remove : pandas.DataFrame
        Each must contain ``username``, ``total_tweets`` and
        ``total_influence``. A user missing from one table counts as 0 there.

    Returns
    -------
    pandas.DataFrame
        Columns ``username``, ``total_tweets``, ``total_influence`` and
        ``avg_influence`` (``total_influence / total_tweets``), sorted by
        ``total_influence`` in descending order. A user whose tweet count
        drops to zero gets NaN or inf as ``avg_influence``, as before.
    """
    merged = combined_inf.merge(to_remove, how='outer', on='username').fillna(0)

    # Build a fresh frame instead of assigning into a column slice, which can
    # trigger pandas' SettingWithCopyWarning.
    combined = pd.DataFrame({
        'username': merged['username'],
        'total_tweets': merged['total_tweets_x'] - merged['total_tweets_y'],
        'total_influence': merged['total_influence_x'] - merged['total_influence_y'],
    })
    combined['avg_influence'] = combined['total_influence']/combined['total_tweets']

    # sort_values returns a new frame; its result must be the one returned.
    return combined.sort_values('total_influence', ascending=False)

In [3]:
# Merge every file under <root>/data/storage/all_cleaned and load the result.
# NOTE(review): `dir` shadows the builtin; the name is kept because a later
# cell builds `oldcascade_file` from it.
dir = get_root_dir()
storagefolder = os.path.join(dir, 'data/storage/all_cleaned')
storagesfiles = glob("{}/*".format(storagefolder))

# merge_csvs' result is handed straight to read_csv -- presumably the path of
# the merged file; verify against utils.common_utils.
combined = merge_csvs(storagesfiles)
df = pd.read_csv(combined)

In [4]:
# Parse Time as datetimes, then let create_cascades annotate the frame
# (a later cell reads the `cascade` column from its result).
df = create_cascades(df.assign(Time=pd.to_datetime(df['Time'])))

In [5]:
# Chronological order; make_tweets later reads the first matching rows of this frame.
df = df.sort_values(by='Time')

In [6]:
# Cascade sizes as a two-column frame: `index` = cascade id, `cascade` = size.
# The column names are set explicitly because a bare
# value_counts().reset_index() names them differently on pandas >= 2.0
# (`cascade` / `count`), which would break the filters below.
counts = df['cascade'].value_counts().rename_axis('index').reset_index(name='cascade')

# Cascades with more than 3 tweets whose id does not occur as a tweet ID in df.
# NOTE(review): the frame displayed in a later cell shows `cascade` as float;
# if it is float here too, 19-digit ids lose precision and the isin() test
# against the integer IDs is unreliable -- confirm the dtype.
ids_count = counts[counts['cascade'] > 3][['index']]
non_existing = ids_count[~ids_count['index'].isin(df['ID'])]

In [7]:
# Fetch the missing tweets and merge them into the tweet frame.
df, profile = rescrape_and_add(df, non_existing)

# Attach each author's follower count; the inner join drops tweets whose user
# has no profile row.
df = df.merge(profile[['username', 'total_followers']], left_on='User', right_on='username', how='inner')
df = df.rename(columns={'Time': 'time', 'total_followers': 'magnitude', 'User': 'user_id'})

# Cascade sizes with explicit column names (`index` = cascade id, `cascade` =
# size); a bare value_counts().reset_index() names them differently on
# pandas >= 2.0.
counts = df['cascade'].value_counts().rename_axis('index').reset_index(name='cascade')

# Keep only cascades that still have more than 2 tweets.
df = df[df['cascade'].isin(counts[counts['cascade'] > 2]['index'])]
oldcascade_file = os.path.join(dir, 'data/storage/old_cascade.csv')
df = df[['ID', 'time', 'magnitude', 'user_id', 'cascade']]

Rescraping 2152 tweets
0 100
100 200
200 300
300 400
400 500
500 600
600 700
700 800
800 900
900 1000
1000 1100
1100 1200
1200 1300
1300 1400
1400 1500
1500 1600
1600 1700
1700 1800
1800 1900
1900 2000
2000 2100
2100 2200


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['timestamp'] = pd.to_datetime(df['timestamp'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['timestamp'] = df['timestamp'].astype(np.int64) // 10**9


Querying https://twitter.com/hqair
Querying https://twitter.com/marthaauwx97
Querying https://twitter.com/qalqaari
Querying https://twitter.com/reynajareg00
Querying https://twitter.com/arenamatchgold
Querying https://twitter.com/tezos
Querying https://twitter.com/btsv_preview
Querying https://twitter.com/brittneymu96
Querying https://twitter.com/skrillscam
Querying https://twitter.com/bluefiretoken
Querying https://twitter.com/dope_tokens
Querying https://twitter.com/z2007e04
Querying https://twitter.com/iskra_coin
Querying https://twitter.com/soundeo_token
Querying https://twitter.com/griffexco
Querying https://twitter.com/mag_bitnerox
Querying https://twitter.com/russian_coin
Querying https://twitter.com/dcto_erc20token
Querying https://twitter.com/dope_tokens
Querying https://twitter.com/iskra_coin
Got an error scraping: 'NoneType' object is not subscriptable
Querying https://twitter.com/heachogtyan_k
Got an error scraping: 'NoneType' object is not subscriptable
Querying https://tw

  cop.to_csv(tempFile, index=None, header=None)


java -jar /home/warproxxx/Desktop/Projects/crypto-analysis-live/algorithm/utils/SentiStrength.jar sentidata /home/warproxxx/Desktop/Projects/crypto-analysis-live/algorithm/utils/SentiStrength_Data/ input /home/warproxxx/Desktop/Projects/crypto-analysis-live/algorithm/data/temp/tweets


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [8]:
# Preview only the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,ID,time,magnitude,user_id,cascade
3,1159811247698825216,1565355524,3465,perrymetzger,1.159797e+18
5,1159819991849590786,1565357609,3465,perrymetzger,1.159818e+18
9,1159868877926477824,1565369264,3465,perrymetzger,1.159868e+18
10,1159869105522036737,1565369318,3465,perrymetzger,1.159869e+18
12,1159870636124246016,1565369683,3465,perrymetzger,1.159870e+18
16,1159873297552744448,1565370318,3465,perrymetzger,1.159873e+18
17,1159874813235728385,1565370679,3465,perrymetzger,1.159875e+18
53,1159924553600184321,1565382538,3465,perrymetzger,1.159924e+18
124,1160190282572414981,1565445893,3465,perrymetzger,1.160190e+18
143,1159881701205708800,1565372321,6976,robustus,1.159797e+18


In [9]:
# Show the path of the stored-cascade CSV built in the earlier cell.
# NOTE(review): the rendered output is an absolute local path.
oldcascade_file

'/home/warproxxx/Desktop/Projects/crypto-analysis-live/algorithm/data/storage/old_cascade.csv'

In [10]:
#check this later
# Persist the current cascades and, when a previous file exists, pull in its
# rows that belong to cascades still present in df.
# NOTE(review): the file is overwritten with the *new* rows only, so old rows
# survive in memory but not on disk -- confirm this is intended.
if os.path.isfile(oldcascade_file):
    old_file = pd.read_csv(oldcascade_file)
    old_file = old_file[old_file['cascade'].isin(df['cascade'])]
    # Skip rows already in df, so re-running on the same data does not
    # duplicate tweet IDs (the later merge on `ID` would multiply them).
    old_file = old_file[~old_file['ID'].isin(df['ID'])]
    df.to_csv(oldcascade_file, index=None)

    df = pd.concat([df, old_file])
    # drop=True: keep the same columns as the else-branch and let the cell be
    # re-run; a plain reset_index() would add an extra `index` column.
    df = df.reset_index(drop=True)
else:
    df.to_csv(oldcascade_file, index=None)

In [None]:
# Per-user influence totals computed from the combined cascade frame.
new_inf = add_influence_and_all(df)