<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"></ul></div>

In [44]:
import pandas as pd
from collections import Counter

from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv('../data/clean_text.csv')

In [28]:
df_prices = pd.read_csv('../data/train/tickers_train.csv')
df_prices = df_prices[df_prices['ticker'] == 'ETH'].copy()

In [37]:
df_prices['timestamp'] = pd.to_datetime(df_prices['datetime'])

In [38]:
df_prices['timestamp'] = df_prices['timestamp'].apply(lambda x: x.timestamp())

In [7]:
df = df[df['clean_text'].str.contains('eth')].copy()

In [8]:
tf_idf = TfidfVectorizer()
raw_vectors = tf_idf.fit_transform(df['clean_text'])

In [11]:
pca = TruncatedSVD(n_components=30)
pca_vectors = pca.fit_transform(raw_vectors)

In [48]:
CLUSTERS_COUNT = 20
clusterizer = KMeans(n_clusters=CLUSTERS_COUNT)
clusters = clusterizer.fit_transform(pca_vectors)

In [22]:
cluster_ind = clusters.argmax(axis=1)

In [23]:
df['cluster'] = cluster_ind

In [32]:
df_prices['change'] = df_prices.priceUsd.pct_change().fillna(0)

In [33]:
df_prices['price_grow'] = df_prices['change'] > 0

In [34]:
df_prices['price_grow'].value_counts()

True     373
False    359
Name: price_grow, dtype: int64

In [55]:
WINDOW = 10
cluster_counts = []
target = []
for i, row in df_prices.iterrows():
    ts_end = row['timestamp']
    ts_start = ts_end - WINDOW * 2 * 3600
    clusters = df[(df['timestamp'] > ts_start) & (df['timestamp'] < ts_end)]['cluster'].values
    counter = Counter(clusters)
    row_cluster_counts = [counter[cluster] for cluster in range(CLUSTERS_COUNT)]
    cluster_counts.append(row_cluster_counts)
    target.append(int(row['price_grow']))

In [56]:
model = LogisticRegression()
model.fit(cluster_counts, target)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [64]:
cluster_counts = df['cluster'].value_counts()

In [66]:
data = []
for i, coef in enumerate(model.coef_[0]):
    data.append([
        i, coef, cluster_counts.get(i, 0)
    ])

In [68]:
d = pd.DataFrame(data, columns=['cluster_no', 'coef', 'count'])

In [71]:
d.sort_values('coef')[:5]

Unnamed: 0,cluster_no,coef,count
16,16,-0.02379,719
17,17,-0.011442,2
8,8,-0.00489,618
2,2,-0.002708,5457
0,0,0.0,0


In [73]:
d.sort_values('coef', ascending=False)[:10]

Unnamed: 0,cluster_no,coef,count
15,15,0.128515,3
19,19,0.12294,75
10,10,0.114625,16
1,1,0.011865,1370
13,13,0.008765,134
4,4,0.005993,12
7,7,0.005309,3698
18,18,0.001196,17684
3,3,0.0,0
14,14,0.0,0


In [78]:
print('GROW')
texts = ''
for text in df[df['cluster'] == 1]['text'][:10]:
    texts += text
    print(text)

GROW
tokenpay and verge both together are going to fuck btc , ethereum and other alts.
you guys better hurry buy some ethereum before it goes to the moon, and buy indahash
i think there are dozens of people that are waiting the update wether to decide to buy more or sell
you mean the smart people rode the waves and bailed when it couldnt sustain the price, it is going down to the real price now. you should be buying more if you are a believer instead of calling people idiots. there is no perfect way to introduce something where the apes dont get a jiggy
i guess will be no ripple .. more like bnb, btc ,eth
i pump few more eth wait for more idiot lol
listen, maybe it will, maybe it won't, buy you no what, this makes sensem don't like it, piss off. stip wasting your time. go for a drink, get laid, do something other than shit post all day.
should we be buying more eos
 as they say its going to replace eth
so, does doing this turn you on or something? like, are you jerking off at the same 

In [82]:
Counter(texts.split()).most_common(50)

[('to', 10),
 ('the', 10),
 ('are', 7),
 ('and', 6),
 ('it', 6),
 ('more', 6),
 ('you', 6),
 ('buy', 4),
 ('of', 4),
 ('be', 4),
 ('going', 3),
 ('people', 3),
 ('a', 3),
 ('no', 3),
 ('something', 3),
 ('at', 3),
 ('btc', 2),
 ('ethereum', 2),
 ('other', 2),
 ('guys', 2),
 ('there', 2),
 ('or', 2),
 ('is', 2),
 ('buying', 2),
 ('get', 2),
 ('like', 2),
 ('for', 2),
 ('maybe', 2),
 ('this', 2),
 ('tokenpay', 1),
 ('verge', 1),
 ('both', 1),
 ('together', 1),
 ('fuck', 1),
 (',', 1),
 ('alts.you', 1),
 ('better', 1),
 ('hurry', 1),
 ('some', 1),
 ('before', 1),
 ('goes', 1),
 ('moon,', 1),
 ('indahashi', 1),
 ('think', 1),
 ('dozens', 1),
 ('that', 1),
 ('waiting', 1),
 ('update', 1),
 ('wether', 1),
 ('decide', 1)]

In [83]:
print('DOWN')
texts = ''
for text in df[df['cluster'] == 16]['text'][:10]:
    texts += text
    print(text)

DOWN
sold my xvg yesterday and now eth rising up, god bless me yihaaa
neo eth btc nothing else 😎
its only fappening eth can even flip btc... btc flips market
buy btc or eth from coinbase via cc
good project neo ltc eth is deep.   your advise is rubbish
type to the search ltc. there is i thing bnb, btc, eth :)
eth more profit...than convert to btc...
okay understood now... eth from my designated mew
i prefer eth coz btc is volatile
they mine whatever coin is profitable, not just eth


In [84]:
Counter(texts.split()).most_common(50)

[('eth', 9),
 ('is', 5),
 ('btc', 4),
 ('my', 2),
 ('from', 2),
 ('to', 2),
 ('sold', 1),
 ('xvg', 1),
 ('yesterday', 1),
 ('and', 1),
 ('now', 1),
 ('rising', 1),
 ('up,', 1),
 ('god', 1),
 ('bless', 1),
 ('me', 1),
 ('yihaaaneo', 1),
 ('nothing', 1),
 ('else', 1),
 ('😎its', 1),
 ('only', 1),
 ('fappening', 1),
 ('can', 1),
 ('even', 1),
 ('flip', 1),
 ('btc...', 1),
 ('flips', 1),
 ('marketbuy', 1),
 ('or', 1),
 ('coinbase', 1),
 ('via', 1),
 ('ccgood', 1),
 ('project', 1),
 ('neo', 1),
 ('ltc', 1),
 ('deep.', 1),
 ('your', 1),
 ('advise', 1),
 ('rubbishtype', 1),
 ('the', 1),
 ('search', 1),
 ('ltc.', 1),
 ('there', 1),
 ('i', 1),
 ('thing', 1),
 ('bnb,', 1),
 ('btc,', 1),
 (':)eth', 1),
 ('more', 1),
 ('profit...than', 1)]