In [1]:
from profilescraper import query_profile
import pandas as pd
import itertools

import os

from glob import glob

import numpy as np

from proxy_utils import proxy_dict, get_proxies

import numba

In [2]:
import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)

%load_ext autoreload
%autoreload 2

In [3]:
def profiles_to_pandas(profiles):
    userDf = pd.DataFrame(columns=['username', 'location', 'has_location', 'age', 'is_verified', 'total_tweets', 'total_following', 'total_followers', 'total_likes', 'total_moments', 'total_lists', 'has_avatar', 'has_background', 'is_protected', 'profile_modified', 'tweets'])
    tweetDf = pd.DataFrame(columns=['User', 'ID', 'Tweet', 'Time', 'Likes', 'Replies', 'Retweet'])

    for profile in profiles:   
        for tweet in profile.tweets:
            tweetDf = tweetDf.append({'User': profile.username, 'ID': tweet.id, 'Tweet': tweet.text, 'Time': tweet.timestamp, 'Likes': tweet.likes, 'Replies': tweet.replies, 'Retweet': tweet.retweets}, ignore_index=True)

        userDf = userDf.append({'username':profile.username, 'location':profile.location, 'has_location':profile.has_location, 'age':profile.age, 'is_verified':profile.is_verified, 'total_tweets':profile.total_tweets, 'total_following':profile.total_following, 'total_followers':profile.total_followers, 'total_likes':profile.total_likes, 'total_moments':profile.total_moments, 'total_lists':profile.total_lists, 'has_avatar':profile.has_avatar, 'has_background':profile.has_background, 'is_protected':profile.is_protected, 'profile_modified':profile.profile_modified}, ignore_index=True)

    tweetDf = tweetDf.to_csv('profiledata/userTweets.csv', index=None, mode='a')
    userDf['username'].to_csv('profiledata/extractedUsers.csv', index=None, mode='a')
    userDf.to_csv('profiledata/userData.csv', index=None, mode='a')
    
    print("Saved to userTweets.csv and extractedUsers.csv")

In [4]:
def scrape_list(currList, poolsize, proxy, count):
        
    if (len(currList) > 0):
        profiles = query_profile(currList, poolsize=poolsize, proxy=proxy)
        profiles_to_pandas(profiles)

        count += 1
    
    return count

In [5]:
def perform_extraction(coinname, poolsize=20):
    proxies = get_proxies()
    proxySize = len(proxies)
    
    users = list(set(pd.read_csv('{}/extracted/combined.csv'.format(coinname), dtype=str)['User']))
    
    try:
        alreadyRead = pd.read_csv('profiledata/extractedUsers.csv', header=None)[0]
    except FileNotFoundError:
        logging.info("Already extracted users not found - Starting from a clean slate")
        os.mknod("profiledata/extractedUsers.csv")
        alreadyRead = pd.Series()
        
    
    uniqueUsers = list(set(users) - set(alreadyRead))
    
    print("File contains {} data. Scraping for {} after cache".format(len(users), len(uniqueUsers)))
    
    oldi = 0
    count = 0
    
    for i in range(0, len(uniqueUsers), poolsize*5):
        count = scrape_list(uniqueUsers[oldi:i], poolsize=poolsize, proxy=proxies[count], count=count)
        
        if (count >= proxySize):
            count = 0
        
        logging.info("Done {} of {}".format(i, len(uniqueUsers)))
        oldi = i
    
    scrape_list(uniqueUsers[i:], poolsize=poolsize, proxy=None, count=count)

In [None]:
for files in glob(os.getcwd() + "/*"):
    if (os.path.exists(files + "/extracted")):
        print("Extracting for {}".format(files))
        perform_extraction(files.split("/")[-1], poolsize=30)

In [6]:
def print_details():
    allUsers = pd.DataFrame()

    for files in glob(os.getcwd() + "/*"):
        if (os.path.exists(files + "/extracted")):
            fname = files + "/extracted/combined.csv"
            tDf = pd.read_csv(fname)
            print("Reading from {}".format(fname))
            allUsers = pd.concat([allUsers, tDf['User']])

In [7]:
@numba.jit
def clean_files():
    userData = pd.read_csv(os.getcwd() + "/profiledata/userData.csv")
    userTweets = pd.read_csv(os.getcwd() + "/profiledata/userTweets.csv")
    
    userTweets = userTweets.rename(columns={'User': 'username'})
    
    merged = pd.merge(userData, userTweets, how='inner', on=['username'])    
    newuserData = merged[userData.columns]
    newuserData = newuserData.set_index('username').drop_duplicates().reset_index()
    
    newuserTweets = merged[userTweets.columns]
    newuserTweets = newuserTweets.rename(columns={'username': 'User'})
    
    newuserTweets = newuserTweets.set_index(['User', 'ID']).drop_duplicates().reset_index()
    
    return userData, userTweets, newuserData, newuserTweets

In [8]:
userData, userTweets, newuserData, newuserTweets = clean_files()

  interactivity=interactivity, compiler=compiler, result=result)


In [9]:
len(set(userData['username']))

289816

In [None]:
len(set(userTweets['username']))

In [17]:
len(set(newuserData['username']))

277024

In [19]:
len(set(newuserTweets['User']))

277023