# Analiza Podatkov: Daily Discussion

In [1]:
import numpy as np

import glob

import os

## Nalaganje podatkov

### Cena Bitcoinov

In [2]:
# Funkcije za parsanje podatkov

def parse_prices_file(file):
    """Parse a semicolon-separated price file into a list of floats.

    The first line is skipped (treated as a header). For every other
    non-blank line, the third ``;``-separated field is converted to
    ``float`` and collected.

    Args:
        file: An iterable of text lines (e.g. an open text file).

    Returns:
        list[float]: The parsed values, in file order.

    Raises:
        IndexError: If a non-blank data line has fewer than three fields.
        ValueError: If the third field cannot be converted to ``float``.
    """
    prices = []

    for line_index, line in enumerate(file):
        # The first line is the header row.
        if line_index == 0:
            continue

        # Blank lines (typically a trailing newline at the end of the file)
        # carry no data; without this guard they crash on the field lookup.
        if not line.strip():
            continue

        fields = line.split(";")
        prices.append(float(fields[2]))

    return prices

In [3]:
# Load the data.
# The context manager closes the file as soon as parsing is finished, so the
# handle is not leaked for the lifetime of the kernel.
with open("../data/price/bitstamp_bitcoin.txt") as bitstamp_prices_file:
    bitstamp_prices = parse_prices_file(bitstamp_prices_file)

In [4]:
# Compute the price movement between each pair of consecutive prices:
#    * next price is higher -> 1
#    * next price is equal  -> 0
#    * next price is lower  -> -1

bitstamp_price_correctPredictions = []

for price, nextPrice in zip(bitstamp_prices, bitstamp_prices[1:]):
    if nextPrice > price:
        correct_prediction = 1
    elif nextPrice < price:
        correct_prediction = -1
    else:
        # Equal prices. Values that cannot be ordered (e.g. NaN) also end up
        # here instead of silently reusing the previous iteration's result.
        correct_prediction = 0

    bitstamp_price_correctPredictions.append(correct_prediction)


print("Correct price predictions (for Bitstamp):")
print(bitstamp_price_correctPredictions)

Correct price predictions (for Bitstamp):
[1, -1, -1, -1, 1, -1, -1, 1, 1]


### Reddit

In [5]:
# Funkcije za parsanje podatkov

def parse_submission_file(file):
    """Parse one submission dump into a tuple of header values and comments.

    The first five lines carry header values, each located after a
    fixed-width prefix; the next two lines are ignored; every remaining line
    is handed to ``pase_comment_line``.

    Args:
        file: An iterable of text lines (e.g. an open text file).

    Returns:
        tuple: ``(file_creationTime, subreddit, submission_title,
        submission_creationTime, submission_score, comments)``. The two
        string values are returned exactly as sliced from the line, so any
        trailing newline is kept. ``comments`` is a list of the tuples
        produced by ``pase_comment_line``.
    """
    comments = []

    for position, row in enumerate(file):
        if position == 0:
            file_creationTime = int(row[17:])
        elif position == 1:
            subreddit = row[11:]
        elif position == 2:
            submission_title = row[18:]
        elif position == 3:
            submission_creationTime = int(row[24:])
        elif position == 4:
            submission_score = int(row[18:])
        elif position > 6:
            comments.append(pase_comment_line(row))
        # Lines at positions 5 and 6 are intentionally skipped.

    return (
        file_creationTime,
        subreddit,
        submission_title,
        submission_creationTime,
        submission_score,
        comments,
    )

def pase_comment_line(line):
    """Split one tab-separated comment line into its four leading fields.

    Args:
        line: A text line whose first three tab-separated fields are integers
            and whose fourth field is the comment text.

    Returns:
        tuple: ``(level, creationTime, score, content)`` where the first three
        items are ``int`` and ``content`` is the fourth field as-is (including
        a trailing newline, if present). Fields after the fourth are ignored.
    """
    fields = line.split("\t")
    return (
        int(fields[0]),  # level
        int(fields[1]),  # creation time
        int(fields[2]),  # score
        fields[3],       # content
    )

In [6]:
# Naloži vse potrebne podatke

def get_dd_dates(file_names):
    """Derive a date label from each file path.

    The label is the file's base name with a trailing ``.txt`` extension
    removed. ``os.path.basename`` is used rather than splitting on "/" so
    that paths using the platform's native separator are handled too.

    Args:
        file_names: An iterable of file paths.

    Returns:
        list[str]: One label per input path, in input order.
    """
    extension = ".txt"
    dates = []
    for file_name in file_names:
        base_name = os.path.basename(file_name)
        # Strip the extension only at the end of the name.
        if base_name.endswith(extension):
            base_name = base_name[:-len(extension)]
        dates.append(base_name)
    return dates

def load_daily_discussion(file_names):
    """Open and parse every submission file in ``file_names``.

    Each file is opened inside a ``with`` block so its handle is closed as
    soon as it has been parsed, instead of leaking one handle per file.

    Args:
        file_names: An iterable of paths to submission files.

    Returns:
        list: One ``parse_submission_file`` result per path, in input order.
    """
    submissions = []
    for file_name in file_names:
        with open(file_name) as file:
            submissions.append(parse_submission_file(file))
    return submissions



# Load the Bitcoin daily-discussion files (sorted by file name).
bitcoinDD_filenames = glob.glob("../data/reddit/daily_discussion_bitcoin/*.txt")
bitcoinDD_filenames.sort()
bitcoinDD_submissions = load_daily_discussion(bitcoinDD_filenames)

# Load the CryptoCurrency daily-discussion files (sorted by file name).
cryptocurrencyDD_filenames = glob.glob("../data/reddit/daily_discussion_cryptocurrency/*.txt")
cryptocurrencyDD_filenames.sort()
cryptocurrencyDD_submissions = load_daily_discussion(cryptocurrencyDD_filenames)

# Dates for which data exists; derived from the Bitcoin file names only.
dd_dates = get_dd_dates(bitcoinDD_filenames)

## Izračun

In [7]:
# Pomožne funkcije

def totalNumOfCommentsForSubmission(submission):
    """Return how many comments a parsed submission holds.

    Args:
        submission: A submission tuple whose item at index 5 is the list of
            comments.

    Returns:
        int: The length of the comment list.
    """
    return len(submission[5])

def commentScoreForSubmission(submission):
    """Return the sum of the scores of all comments in a submission.

    Args:
        submission: A submission tuple whose item at index 5 is the list of
            comment tuples; each comment's score is its item at index 2.

    Returns:
        The total score (``0`` when there are no comments).
    """
    return sum(comment[2] for comment in submission[5])

def weightedCommentScoreForSubmission(submission, level_penalty=0.1):
    """Return the sum of comment scores, discounted by nesting level.

    Each comment contributes ``(1 - level * level_penalty) * score``. With
    the default penalty a comment at level 0 counts fully and every further
    level reduces the weight by 0.1.

    NOTE(review): the weight is not clamped, so it becomes zero or negative
    once ``level * level_penalty >= 1`` -- confirm this is intended.

    Args:
        submission: A submission tuple whose item at index 5 is the list of
            comment tuples; a comment's level is its item at index 0 and its
            score is its item at index 2.
        level_penalty: Weight removed per nesting level. Defaults to ``0.1``,
            the value that was previously hard-coded.

    Returns:
        The weighted total (``0`` when there are no comments).
    """
    comments = submission[5]
    total = 0
    # Accumulate one term at a time, in comment order, so the floating-point
    # result does not depend on the summation strategy of a built-in.
    for comment in comments:
        comment_level = comment[0]
        comment_score = comment[2]
        total += (1 - (comment_level * level_penalty)) * comment_score
    return total


### Bitcoin

In [8]:
# Total number of comments per submission
bitcoinDD_totalNumOfComments = [
    totalNumOfCommentsForSubmission(submission)
    for submission in bitcoinDD_submissions
]

print("Bitcoin total number of comments:")
print(bitcoinDD_totalNumOfComments, "\n")

# Comment score per submission
bitcoinDD_commentScore = [
    commentScoreForSubmission(submission)
    for submission in bitcoinDD_submissions
]

print("Bitcoin comment score:")
print(bitcoinDD_commentScore, "\n")

# Weighted comment score per submission
bitcoinDD_weightedCommentScore = [
    weightedCommentScoreForSubmission(submission)
    for submission in bitcoinDD_submissions
]

print("Bitcoin weighted comment score:")
print(bitcoinDD_weightedCommentScore, "\n")

Bitcoin total number of comments:
[173, 163, 306, 243, 390, 210, 107, 419, 195, 147] 

Bitcoin comment score:
[237, 218, 505, 371, 627, 229, 150, 621, 313, 229] 

Bitcoin weighted comment score:
[203.8000000000001, 189.00000000000014, 439.8999999999993, 313.80000000000024, 490.49999999999955, 189.70000000000022, 133.90000000000006, 516.299999999999, 258.8000000000002, 205.50000000000017] 



### CryptoCurrency

In [9]:
# Total number of comments per submission
cryptocurrencyDD_totalNumOfComments = [
    totalNumOfCommentsForSubmission(submission)
    for submission in cryptocurrencyDD_submissions
]

print("CryptoCurrency total number of comments:")
print(cryptocurrencyDD_totalNumOfComments, "\n")

# Comment score per submission
cryptocurrencyDD_commentScore = [
    commentScoreForSubmission(submission)
    for submission in cryptocurrencyDD_submissions
]

print("CryptoCurrency comment score:")
print(cryptocurrencyDD_commentScore, "\n")

# Weighted comment score per submission
cryptocurrencyDD_weightedCommentScore = [
    weightedCommentScoreForSubmission(submission)
    for submission in cryptocurrencyDD_submissions
]

print("CryptoCurrency weighted comment score:")
print(cryptocurrencyDD_weightedCommentScore, "\n")

CryptoCurrency total number of comments:
[467, 498, 493, 479, 499, 456, 302, 475, 474, 464] 

CryptoCurrency comment score:
[2654, 2833, 2339, 2448, 2576, 1512, 972, 2369, 2062, 1920] 

CryptoCurrency weighted comment score:
[2430.4000000000037, 2632.1000000000017, 2192.399999999999, 2272.6000000000017, 2373.5000000000005, 1369.400000000001, 870.8999999999991, 2202.7000000000035, 1899.000000000001, 1729.1999999999975] 



### Skupaj

In [10]:
# Total number of comments: per-day sum over both sources.
# Values are combined by position, so entry i of each source must describe the
# same day as dd_dates[i].
dd_totalNumOfComments = [0] * len(dd_dates)

for source in (bitcoinDD_totalNumOfComments, cryptocurrencyDD_totalNumOfComments):
    for day_index, value in enumerate(source):
        dd_totalNumOfComments[day_index] += value

print("Combined total number of comments:")
print(dd_totalNumOfComments, "\n")

# Comment score: per-day sum over both sources.
dd_commentScore = [0] * len(dd_dates)

for source in (bitcoinDD_commentScore, cryptocurrencyDD_commentScore):
    for day_index, value in enumerate(source):
        dd_commentScore[day_index] += value


print("Combined comment score:")
print(dd_commentScore, "\n")

# Weighted comment score: per-day sum over both sources.
dd_weightedCommentScore = [0] * len(dd_dates)

for source in (bitcoinDD_weightedCommentScore, cryptocurrencyDD_weightedCommentScore):
    for day_index, value in enumerate(source):
        dd_weightedCommentScore[day_index] += value

print("Combined weighted comment score:")
print(dd_weightedCommentScore, "\n")


Combined total number of comments:
[640, 661, 799, 722, 889, 666, 409, 894, 669, 611] 

Combined comment score:
[2891, 3051, 2844, 2819, 3203, 1741, 1122, 2990, 2375, 2149] 

Combined weighted comment score:
[2634.200000000004, 2821.1000000000017, 2632.2999999999984, 2586.400000000002, 2864.0, 1559.1000000000013, 1004.7999999999992, 2719.0000000000027, 2157.800000000001, 1934.6999999999978] 



## Ocenjevanje: glede na povprečje

In [12]:
# _predictions = price:
#    * Increase -> 1
#    * Same -> 0
#    * Decrease -> -1

### Skupno število komentarjev

In [13]:
# Mean of the combined per-day comment counts.
totalNumOfComments_data = dd_totalNumOfComments
dd_totalNumOfComments_mean = np.mean(totalNumOfComments_data)

print("Mean total number of comments:", dd_totalNumOfComments_mean, sep="\n")

Mean total number of comments:
696.0


In [14]:
# Makes predictions by comparing each day's comment count with the mean:
# above the mean -> 1, equal to the mean -> 0, below the mean -> -1.
dd_totalNumOfComments_mean_predictions = [
    1 if numOfComments > dd_totalNumOfComments_mean
    else 0 if numOfComments == dd_totalNumOfComments_mean
    else -1
    for numOfComments in dd_totalNumOfComments
]

# Drop the last prediction because there is no correct-prediction data for it.
del dd_totalNumOfComments_mean_predictions[-1:]

print("Predictions:")
print(dd_totalNumOfComments_mean_predictions)

Predictions:
[-1, -1, 1, 1, 1, -1, -1, 1, -1]


In [15]:
# Evaluates the result: prints one row per prediction and tallies the hits.

numOfCorrect = 0
numOfIncorrect = 0

print("Price     ", "Actual", "Predicted", "wasCorrect")
print("---------------------------------------")
for i, prediction in enumerate(dd_totalNumOfComments_mean_predictions):
    date = dd_dates[i + 1]  # there is no price prediction for the first date
    correct_prediction = bitstamp_price_correctPredictions[i]

    wasCorrect = prediction == correct_prediction
    numOfCorrect += int(wasCorrect)
    numOfIncorrect += int(not wasCorrect)

    # "!s" converts each value with str() first, so the padding matches str(x).rjust(width).
    print("{} {!s:>6} {!s:>9} {!s:>10}".format(date, correct_prediction, prediction, wasCorrect))

print()
print("{}/{}".format(numOfCorrect, numOfCorrect + numOfIncorrect))

Price      Actual Predicted wasCorrect
---------------------------------------
2018-04-03      1        -1      False
2018-04-04     -1        -1       True
2018-04-05     -1         1      False
2018-04-06     -1         1      False
2018-04-07      1         1       True
2018-04-08     -1        -1       True
2018-04-09     -1        -1       True
2018-04-10      1         1       True
2018-04-11      1        -1      False

5/9


### Skupen score komentarjev

In [16]:
# Mean of the combined per-day comment scores.
commentsScore_data = dd_commentScore
dd_commentsScore_mean = np.mean(commentsScore_data)

print("Mean comments score:", dd_commentsScore_mean, sep="\n")

Mean comments score:
2518.5


In [17]:
# Makes predictions by comparing each day's comment score with the mean:
# above the mean -> 1, equal to the mean -> 0, below the mean -> -1.
dd_commentsScore_mean_predictions = [
    1 if commentsScore > dd_commentsScore_mean
    else 0 if commentsScore == dd_commentsScore_mean
    else -1
    for commentsScore in dd_commentScore
]

# Drop the last prediction because there is no correct-prediction data for it.
del dd_commentsScore_mean_predictions[-1:]

print("Predictions:")
print(dd_commentsScore_mean_predictions)

Predictions:
[1, 1, 1, 1, 1, -1, -1, 1, -1]


In [18]:
# Evaluates the result: prints one row per prediction and tallies the hits.

numOfCorrect = 0
numOfIncorrect = 0

print("Price     ", "Actual", "Predicted", "wasCorrect")
print("---------------------------------------")
for i, prediction in enumerate(dd_commentsScore_mean_predictions):
    date = dd_dates[i + 1]  # there is no price prediction for the first date
    correct_prediction = bitstamp_price_correctPredictions[i]

    wasCorrect = prediction == correct_prediction
    numOfCorrect += int(wasCorrect)
    numOfIncorrect += int(not wasCorrect)

    # "!s" converts each value with str() first, so the padding matches str(x).rjust(width).
    print("{} {!s:>6} {!s:>9} {!s:>10}".format(date, correct_prediction, prediction, wasCorrect))

print()
print("{}/{}".format(numOfCorrect, numOfCorrect + numOfIncorrect))

Price      Actual Predicted wasCorrect
---------------------------------------
2018-04-03      1         1       True
2018-04-04     -1         1      False
2018-04-05     -1         1      False
2018-04-06     -1         1      False
2018-04-07      1         1       True
2018-04-08     -1        -1       True
2018-04-09     -1        -1       True
2018-04-10      1         1       True
2018-04-11      1        -1      False

5/9


### Skupen utežen score komentarjev

In [19]:
# Mean of the combined per-day weighted comment scores.
weightedCommentsScore_data = dd_weightedCommentScore
dd_weightedCommentsScore_mean = np.mean(weightedCommentsScore_data)

print("Mean weighted comments score:", dd_weightedCommentsScore_mean, sep="\n")

Mean weighted comments score:
2291.34


In [20]:
# Makes predictions by comparing each day's weighted comment score with the mean:
# above the mean -> 1, equal to the mean -> 0, below the mean -> -1.
dd_weightedCommentsScore_mean_predictions = [
    1 if weightedCommentsScore > dd_weightedCommentsScore_mean
    else 0 if weightedCommentsScore == dd_weightedCommentsScore_mean
    else -1
    for weightedCommentsScore in dd_weightedCommentScore
]

# Drop the last prediction because there is no correct-prediction data for it.
del dd_weightedCommentsScore_mean_predictions[-1:]

print("Predictions:")
print(dd_weightedCommentsScore_mean_predictions)

Predictions:
[1, 1, 1, 1, 1, -1, -1, 1, -1]


In [21]:
# Evaluates the result: prints one row per prediction and tallies the hits.

numOfCorrect = 0
numOfIncorrect = 0

print("Price     ", "Actual", "Predicted", "wasCorrect")
print("---------------------------------------")
for i, prediction in enumerate(dd_weightedCommentsScore_mean_predictions):
    date = dd_dates[i + 1]  # there is no price prediction for the first date
    correct_prediction = bitstamp_price_correctPredictions[i]

    wasCorrect = prediction == correct_prediction
    numOfCorrect += int(wasCorrect)
    numOfIncorrect += int(not wasCorrect)

    # "!s" converts each value with str() first, so the padding matches str(x).rjust(width).
    print("{} {!s:>6} {!s:>9} {!s:>10}".format(date, correct_prediction, prediction, wasCorrect))

print()
print("{}/{}".format(numOfCorrect, numOfCorrect + numOfIncorrect))

Price      Actual Predicted wasCorrect
---------------------------------------
2018-04-03      1         1       True
2018-04-04     -1         1      False
2018-04-05     -1         1      False
2018-04-06     -1         1      False
2018-04-07      1         1       True
2018-04-08     -1        -1       True
2018-04-09     -1        -1       True
2018-04-10      1         1       True
2018-04-11      1        -1      False

5/9


## Ocenjevanje: glede na prejšnji dan

In [22]:
# Ground truth for the "compared with the previous day" evaluation: the price
# movements with the first entry dropped, i.e. starting from the movement
# between the second and third price.
bitstamp_price_yesterdayCorrectPredictions = bitstamp_price_correctPredictions[1:]

print(bitstamp_price_yesterdayCorrectPredictions)

[-1, -1, -1, 1, -1, -1, 1, 1]


### Skupno število komentarjev

In [23]:
# Makes predictions by comparing each day's comment count with the previous
# day's: more comments -> 1, same -> 0, fewer -> -1.
# The first day has no previous day, so it yields no prediction.

dd_totalNumOfComments_yesterday_predictions = [
    1 if numOfComments > yesterday_numOfComments
    else 0 if numOfComments == yesterday_numOfComments
    else -1
    for yesterday_numOfComments, numOfComments
    in zip(dd_totalNumOfComments, dd_totalNumOfComments[1:])
]

# Drop the last prediction because there is no correct-prediction data for it.
del dd_totalNumOfComments_yesterday_predictions[-1:]

print("Predictions:")
print(dd_totalNumOfComments_yesterday_predictions)

Predictions:
[1, 1, -1, 1, -1, -1, 1, -1]


In [24]:
# Evaluates the result: prints one row per prediction and tallies the hits.

numOfCorrect = 0
numOfIncorrect = 0

print("Price     ", "Actual", "Predicted", "wasCorrect")
print("---------------------------------------")
for i, prediction in enumerate(dd_totalNumOfComments_yesterday_predictions):
    date = dd_dates[i + 1]  # there is no price prediction for the first date
    correct_prediction = bitstamp_price_yesterdayCorrectPredictions[i]

    wasCorrect = prediction == correct_prediction
    numOfCorrect += int(wasCorrect)
    numOfIncorrect += int(not wasCorrect)

    # "!s" converts each value with str() first, so the padding matches str(x).rjust(width).
    print("{} {!s:>6} {!s:>9} {!s:>10}".format(date, correct_prediction, prediction, wasCorrect))

print()
print("{}/{}".format(numOfCorrect, numOfCorrect + numOfIncorrect))

Price      Actual Predicted wasCorrect
---------------------------------------
2018-04-03     -1         1      False
2018-04-04     -1         1      False
2018-04-05     -1        -1       True
2018-04-06      1         1       True
2018-04-07     -1        -1       True
2018-04-08     -1        -1       True
2018-04-09      1         1       True
2018-04-10      1        -1      False

5/8


### Skupen score komentarjev

In [25]:
# Makes predictions by comparing each day's comment score with the previous
# day's: higher score -> 1, same -> 0, lower -> -1.
#
# Consecutive days are paired directly instead of tracking "yesterday" with a
# -1 sentinel. Scores are sums of signed integers, so -1 is a legitimate
# value; with the sentinel, the day after such a value would be mistaken for
# "no previous day" and its prediction silently dropped.

dd_commentsScore_yesterday_predictions = []

for yesterday_commentsScore, commentsScore in zip(dd_commentScore, dd_commentScore[1:]):
    if commentsScore > yesterday_commentsScore:
        prediction = 1
    elif commentsScore < yesterday_commentsScore:
        prediction = -1
    else:
        prediction = 0

    dd_commentsScore_yesterday_predictions.append(prediction)


# Drop the last prediction because there is no correct-prediction data for it.
dd_commentsScore_yesterday_predictions = dd_commentsScore_yesterday_predictions[:-1]

print("Predictions:")
print(dd_commentsScore_yesterday_predictions)

Predictions:
[1, -1, -1, 1, -1, -1, 1, -1]


In [26]:
# Evaluates the result: prints one row per prediction and tallies the hits.

numOfCorrect = 0
numOfIncorrect = 0

print("Price     ", "Actual", "Predicted", "wasCorrect")
print("---------------------------------------")
for i, prediction in enumerate(dd_commentsScore_yesterday_predictions):
    date = dd_dates[i + 1]  # there is no price prediction for the first date
    correct_prediction = bitstamp_price_yesterdayCorrectPredictions[i]

    wasCorrect = prediction == correct_prediction
    numOfCorrect += int(wasCorrect)
    numOfIncorrect += int(not wasCorrect)

    # "!s" converts each value with str() first, so the padding matches str(x).rjust(width).
    print("{} {!s:>6} {!s:>9} {!s:>10}".format(date, correct_prediction, prediction, wasCorrect))

print()
print("{}/{}".format(numOfCorrect, numOfCorrect + numOfIncorrect))

Price      Actual Predicted wasCorrect
---------------------------------------
2018-04-03     -1         1      False
2018-04-04     -1        -1       True
2018-04-05     -1        -1       True
2018-04-06      1         1       True
2018-04-07     -1        -1       True
2018-04-08     -1        -1       True
2018-04-09      1         1       True
2018-04-10      1        -1      False

6/8


### Skupen utežen score komentarjev

In [27]:
# Makes predictions by comparing each day's weighted comment score with the
# previous day's: higher score -> 1, same -> 0, lower -> -1.
#
# Consecutive days are paired directly instead of tracking "yesterday" with a
# -1 sentinel. A weighted score can legitimately equal -1; with the sentinel,
# the day after such a value would be mistaken for "no previous day" and its
# prediction silently dropped.

dd_weightedCommentsScore_yesterday_predictions = []

for yesterday_commentsScore, commentsScore in zip(dd_weightedCommentScore, dd_weightedCommentScore[1:]):
    if commentsScore > yesterday_commentsScore:
        prediction = 1
    elif commentsScore < yesterday_commentsScore:
        prediction = -1
    else:
        prediction = 0

    dd_weightedCommentsScore_yesterday_predictions.append(prediction)


# Drop the last prediction because there is no correct-prediction data for it.
dd_weightedCommentsScore_yesterday_predictions = dd_weightedCommentsScore_yesterday_predictions[:-1]

print("Predictions:")
print(dd_weightedCommentsScore_yesterday_predictions)

Predictions:
[1, -1, -1, 1, -1, -1, 1, -1]


In [28]:
# Evaluates the result: prints one row per prediction and tallies the hits.

numOfCorrect = 0
numOfIncorrect = 0

print("Price     ", "Actual", "Predicted", "wasCorrect")
print("---------------------------------------")
for i, prediction in enumerate(dd_weightedCommentsScore_yesterday_predictions):
    date = dd_dates[i + 1]  # there is no price prediction for the first date
    correct_prediction = bitstamp_price_yesterdayCorrectPredictions[i]

    wasCorrect = prediction == correct_prediction
    numOfCorrect += int(wasCorrect)
    numOfIncorrect += int(not wasCorrect)

    # "!s" converts each value with str() first, so the padding matches str(x).rjust(width).
    print("{} {!s:>6} {!s:>9} {!s:>10}".format(date, correct_prediction, prediction, wasCorrect))

print()
print("{}/{}".format(numOfCorrect, numOfCorrect + numOfIncorrect))

Price      Actual Predicted wasCorrect
---------------------------------------
2018-04-03     -1         1      False
2018-04-04     -1        -1       True
2018-04-05     -1        -1       True
2018-04-06      1         1       True
2018-04-07     -1        -1       True
2018-04-08     -1        -1       True
2018-04-09      1         1       True
2018-04-10      1        -1      False

6/8
