In [1]:
import sys
from DB import DB
import pprint
from collections import defaultdict
from sortedcontainers import SortedListWithKey
import statistics
import requests
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import string
from xml.etree import ElementTree
from auth import AzureAuthClient
import requests
import pickle
from nltk.stem import PorterStemmer
from datetime import datetime

In [2]:
def get_collection(argv):
    """Open a database handle and return one of its collections.

    Args:
        argv: Indexable with at least three items. ``argv[0]`` and ``argv[1]``
            are passed to ``DB(...)`` and ``argv[2]`` to ``get_collection`` on
            the resulting object. The call site in this notebook passes a
            MongoDB URI, a database name and a collection name, in that order.

    Returns:
        Whatever ``DB.get_collection`` returns for ``argv[2]``.
    """
    return DB(argv[0], argv[1]).get_collection(argv[2])

In [3]:
def get_results(collection, query, projection):
    """Run ``collection.find`` and return every matching document as a list.

    Args:
        collection: Object exposing ``find(filter, projection)``.
        query: Filter document. ``None`` or an empty dict matches everything.
        projection: Projection document. ``None`` or an empty dict selects the
            default set of tweet fields listed below.

    Returns:
        list: All documents yielded by the cursor, fully materialised.
    """
    # Fields read by the analysis and display cells of this notebook.
    # NOTE(review): "_id": -1 is kept from the original; if the intent was to
    # exclude _id, the value should presumably be 0 - confirm.
    default_projection = {
        "_id": -1,
        "id": 1,
        "user.followers_count": 1,
        "favorite_count": 1,
        "retweet_count": 1,
        "entities.hashtags": 1,
        "text": 1,
        "text_en": 1,
        "processed_text": 1,
        "created_at": 1,
    }
    cursor_it = collection.find(query or {}, projection or default_projection)
    return list(cursor_it)
'''
    
'''

'\n    \n'

In [13]:
def analytics(cursor):
    """Aggregate token counts, top tweets and BMW price mentions.

    Args:
        cursor: Iterable of tweet dicts. Each tweet must provide
            ``processed_text`` (an iterable of tokens), ``retweet_count``,
            ``favorite_count`` and ``user['followers_count']``.

    Returns:
        tuple: ``(vocabulary, top_retweeted, top_fav, top_followed, prices)``

        * ``vocabulary`` - ``defaultdict(int)`` mapping token -> occurrences.
        * ``top_retweeted`` - ``SortedListWithKey`` keyed on ``retweet_count``,
          capped at 200 tweets.
        * ``top_fav`` - ``SortedListWithKey`` keyed on ``favorite_count``,
          capped at 100 tweets.
        * ``top_followed`` - ``SortedListWithKey`` keyed on
          ``user['followers_count']``, capped at 100 tweets.
        * ``prices`` - floats strictly between 50 and 100 found in tweets that
          contain the token ``"bmw"`` and at least one clue token.
    """
    # Tokens that mark a tweet as talking about the stock (built once, not per tweet).
    clues = {"bearish", "bullish", "hold", "stock", "share"}

    def _add_bounded(ranking, tweet, limit):
        """Insert ``tweet`` and drop the first (lowest-keyed) entry once over ``limit``."""
        ranking.add(tweet)
        if len(ranking) > limit:
            ranking.pop(0)

    vocabulary = defaultdict(int)
    top_retweeted = SortedListWithKey(key=lambda x: x["retweet_count"])
    top_fav = SortedListWithKey(key=lambda x: x["favorite_count"])
    top_followed = SortedListWithKey(key=lambda x: x["user"]['followers_count'])
    prices = []

    for tweet in cursor:
        tokens = tweet["processed_text"]
        # Numbers are only treated as prices in tweets about BMW shares.
        look_for_prices = "bmw" in tokens and any(word in clues for word in tokens)

        for word in tokens:
            if look_for_prices:
                try:
                    number = float(word)
                except (TypeError, ValueError):
                    # Not a numeric token; only conversion errors are ignored.
                    number = None
                if number is not None and 50 < number < 100:
                    prices.append(number)

            vocabulary[word] += 1

        _add_bounded(top_retweeted, tweet, 200)
        _add_bounded(top_fav, tweet, 100)
        _add_bounded(top_followed, tweet, 100)

    return vocabulary, top_retweeted, top_fav, top_followed, prices

In [5]:
def print_prices(prices):
    """Print the average, lowest and highest value of ``prices``.

    Args:
        prices: Sequence of numbers. It is read only; the caller's list is
            not reordered.

    When ``prices`` is empty a short notice is printed instead of raising
    ``ZeroDivisionError``.
    """
    if not prices:
        print("No prices found.")
        return

    print("Average price: {}".format(sum(prices) / len(prices)))
    # min/max keep the labels correct regardless of the input order.
    print("Lowest price: {}".format(min(prices)))
    print("Highest price: {}".format(max(prices)))

In [9]:
def process_german_tweets(cursor, collection):
    """Tokenise German tweets and store the tokens as ``processed_text_de``.

    Tweets that already carry ``processed_text_de`` are skipped. For every
    other tweet the text is tokenised, German stop words, punctuation (except
    ``#`` and ``@``), ``'...'`` and tokens containing ``'http'`` are dropped,
    and the document with the same ``id`` is updated: ``processed_text_de`` is
    set and ``processed_text`` is removed.

    Args:
        cursor: Iterable of tweet dicts providing ``text`` and ``id``.
        collection: Object exposing ``update_one(filter, update)``.
    """
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=False)

    ignored = set(stopwords.words("german"))
    # Single punctuation characters are noise, but '#' and '@' are kept.
    ignored.update(ch for ch in string.punctuation if ch not in '#@')
    ignored.add('...')

    for tweet in cursor:
        if 'processed_text_de' in tweet:
            continue

        tokens = [
            token for token in tokenizer.tokenize(tweet['text'])
            if token not in ignored and 'http' not in token
        ]
        # One update instead of two: both field changes are applied together
        # and each tweet costs a single round trip.
        collection.update_one(
            {"id": tweet["id"]},
            {'$set': {'processed_text_de': tokens},
             '$unset': {'processed_text': 1}},
        )

In [4]:
def preprocess_tweet(text:str, language):
    """Tokenise a tweet and strip stop words, punctuation and URLs.

    Args:
        text: Raw tweet text.
        language: ``'en'`` for English or ``'de'`` for German. ``'german'`` is
            still accepted as an alias of ``'de'`` because the previous
            implementation compared against that value.

    Returns:
        list[str] | None: For ``'en'`` the remaining tokens, Porter-stemmed
        except for tokens starting with ``#`` or ``@``. For ``'de'`` the
        remaining tokens unstemmed. ``None`` for any other language.
    """
    stopword_corpus = {'en': 'english', 'de': 'german', 'german': 'german'}.get(language)
    if stopword_corpus is None:
        return None

    # Tokenize the tweet text
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=False)

    # Stop words of the requested language (previously always English), plus
    # single punctuation characters other than '#' and '@', plus '...'.
    ignored = set(stopwords.words(stopword_corpus))
    ignored.update(ch for ch in string.punctuation if ch not in '#@')
    ignored.add('...')

    tokens = [
        token for token in tokenizer.tokenize(text)
        if token not in ignored and 'http' not in token
    ]

    if language == 'en':
        # Stem everything except hashtags and mentions.
        stemmer = PorterStemmer()
        return [
            token if token.startswith(('#', '@')) else stemmer.stem(token)
            for token in tokens
        ]

    return tokens

In [10]:
def process(cursor, collection):
    """Store the preprocessed English text of every tweet as ``processed_text``.

    For each tweet, ``text_en`` is run through ``preprocess_tweet(..., 'en')``;
    when ``text_en`` is ``None`` an empty string is stored instead. The result
    is written to the document with the same ``id``.

    Args:
        cursor: Iterable of tweet dicts providing ``text_en`` and ``id``.
        collection: Object exposing ``update_one(filter, update)``.
    """
    for tweet in cursor:
        english_text = tweet['text_en']
        tokens = "" if english_text is None else preprocess_tweet(english_text, 'en')

        collection.update_one(
            {"id": tweet["id"]},
            {'$set': {'processed_text': tokens}},
        )

In [24]:
def save_translation(cursor, collection, client_secret=None):
    """Translate every tweet in ``cursor`` to English and pickle the results.

    Each tweet's text is tokenised, tokens containing ``'http'`` are dropped,
    and the remaining tokens are joined with spaces and sent to the Microsoft
    Translator v2 HTTP endpoint. All results are written to
    ``translations.p`` in the working directory.

    Args:
        cursor: Iterable of tweet dicts providing ``text`` and ``id``.
        collection: Unused; kept so existing calls keep working.
        client_secret: Secret passed to ``AzureAuthClient``. When omitted it
            is read from the ``AZURE_TRANSLATOR_SECRET`` environment variable.

    Returns:
        list: ``(tweet_id, translation)`` tuples, where ``translation`` is the
        root element parsed from the XML response (not its text).

    Raises:
        RuntimeError: If no client secret is supplied or configured.
    """
    # Local imports keep this cell self-contained.
    import os
    from urllib.parse import quote

    # The secret must not live in the notebook; take it from the caller or the
    # environment instead.
    if client_secret is None:
        client_secret = os.environ.get("AZURE_TRANSLATOR_SECRET")
    if not client_secret:
        raise RuntimeError(
            "No translator secret: pass client_secret or set AZURE_TRANSLATOR_SECRET"
        )

    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=False)
    auth_client = AzureAuthClient(client_secret)
    # Header name without the trailing space the original had.
    headers = {"Authorization": b'Bearer ' + auth_client.get_access_token()}

    translations = []
    for tweet in cursor:
        tokens = tokenizer.tokenize(tweet['text'])
        text_to_translate = " ".join(word for word in tokens if 'http' not in word)

        # Percent-encode the text: a raw '#' would start the URL fragment and a
        # raw '&' would start a new query parameter, truncating the tweet.
        # NOTE(review): the endpoint is plain http, so the bearer token travels
        # unencrypted - consider https if the service supports it.
        translate_url = "http://api.microsofttranslator.com/v2/Http.svc/Translate?text={}&to={}".format(
            quote(text_to_translate, safe=''), 'en')
        response = requests.get(translate_url, headers=headers)
        # parse xml return values
        translation = ElementTree.fromstring(response.text.encode('utf-8'))
        translations.append((tweet["id"], translation))

    # Context manager so the file is flushed and closed even on error.
    with open("translations.p", "wb") as handle:
        pickle.dump(translations, handle)
    return translations
                
    #collection.update_one({"id":tweet["id"]}, { '$set': { 'text_en': translation}})

In [5]:
# Connection settings: MongoDB URI, then (presumably) database and collection name.
connection_args = ["mongodb://127.0.0.1:27017", "dax", "tweets_de"]
collection = get_collection(connection_args)

# Despite its name, `cursor` is a plain list: get_results returns list(...).
cursor = get_results(collection, {}, {})

In [14]:
# Aggregate once over the loaded tweets: token counts, the three top-tweet
# rankings and the extracted price mentions.
vocabulary, top_retweeted, top_fav, top_followed, prices = analytics(cursor)

In [None]:
# Walk top_fav back to front.
for item in reversed(top_fav):
    # Tweets without an English translation fall back to their original text
    # instead of raising KeyError.
    pprint.pprint(item.get('text_en') or item['text'])

In [None]:
# Walk top_followed back to front.
for item in reversed(top_followed):
    # Tweets without an English translation fall back to their original text
    # instead of raising KeyError.
    pprint.pprint(item.get('text_en') or item['text'])

In [None]:
# Print the original text of every tweet in top_retweeted, in the list's own order.
for tweet_text in (item['text'] for item in top_retweeted):
    pprint.pprint(tweet_text)

In [None]:
# Tokens ordered from most to least frequent.
sorted_vocabulary = sorted(vocabulary, key=vocabulary.get, reverse=True)

# Show the 41 most frequent tokens; slicing (rather than indexing 0..40)
# avoids an IndexError when the vocabulary is smaller than that.
for word in sorted_vocabulary[:41]:
    print("{},{}".format(word, vocabulary[word]))

In [None]:
# Report how often selected tokens occur. The tokenizer is created with
# preserve_case=False, so tokens are lower-case: 'Bullish' could never match
# and is looked up as 'bullish' instead.
keywords = (
    ('recal', 'Recall'),
    ('sale', 'Sales'),
    ('financi', 'Financial'),
    ('bearish', 'Bearish'),
    ('bullish', 'Bullish'),
)
for token, label in keywords:
    # Test membership first: indexing a defaultdict with a missing key would
    # insert it and inflate len(vocabulary).
    if token in vocabulary:
        print(label, vocabulary[token])
print(len(vocabulary))

In [6]:
# Parse each tweet's 'created_at' string into a timezone-aware datetime and
# store it on the tweet under 'date'.
created_at_format = '%a %b %d %H:%M:%S %z %Y'
for tweet in cursor:
    tweet['date'] = datetime.strptime(tweet['created_at'], created_at_format)

In [7]:
# Sort the tweet list in place, oldest first. Every tweet must already carry
# a 'date' key, otherwise the key function raises KeyError.
cursor.sort(key=lambda d: d['date'])

In [8]:
# Dates of the first and last tweet in the list (the time span covered once
# the list is sorted by 'date').
print(cursor[0]['date'], cursor[-1]["date"], sep="\n")

2017-07-18 08:37:50+00:00
2017-08-01 15:36:20+00:00
