# Starter Script (Change REPO_PATH at Start)

In [34]:
# REPO_PATH, for easier standardization
from os import path
REPO_PATH = "/Users/tks/Documents/GitHub/cs591_sns_ipo/" # change me

# all imports
import re, string
import random
import json as json
from nltk import word_tokenize
from nltk.corpus import twitter_samples
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import FreqDist
from nltk import classify
from nltk import NaiveBayesClassifier
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
import os.path as path
from os import makedirs, listdir

# ------------------------------------
# Helper functions for sentiment analysis
# ------------------------------------
def tweets_to_string(data):    
    list = []   
    for key in data:
        list += [data[key]['text']]
    return list
def retweet_count(data):
    list = []   
    for key in data:
        list += [data[key]['retweet_count']]
    return list
def favorite_count(data):
    list = []   
    for key in data:
        list += [data[key]['favorite_count']]
    return list

def sentiment_analyzer_scores(text):
    analyser = SentimentIntensityAnalyzer()
    score = analyser.polarity_scores(text)
    lb = score['compound']
    if lb >= 0.05:
        return 1
    elif (lb > -0.05) and (lb < 0.05):
        return 0
    else:
        return -1
    
# ------------------------------------
# Helper functions for tweet scores
# ------------------------------------

# Score calculation:
#     0. Score as is ((fav_count + rt_count) / engagement_total)
#     1. Add 1
#     2. Add Average Score
#     3. If tweets have no engagement, set the engagement to be 1 to avoid 0
def compute_score_0(ticker, output_path="/Users/tks/Documents/GitHub/cs591_sns_ipo/output/"):
    DIR_PATH = path.join(output_path, ticker)
    
    json_list = sorted(listdir(DIR_PATH))
    json_list = [x for x in json_list if (x.endswith(".json") and x[:-5].isdigit())]
    
    master_file_path = path.join(DIR_PATH, "master.json")
    with open(master_file_path) as json_file:
        data = json.load(json_file)
    
    rt_sum, fav_sum = 0, 0
    
    for i in range(len(json_list)):
        test_key = i
        rt_sum += int(data[str(test_key)]["retweet_count"])
        fav_sum += int(data[str(test_key)]["favorite_count"])
    
    engagement_total = rt_sum + fav_sum
    
    score_list = []
        
    for i in range(len(json_list)):
        test_key = i
        rt_count = int(data[str(test_key)]["retweet_count"])
        fav_count = int(data[str(test_key)]["favorite_count"])
        
        score = ((rt_count + fav_count) / engagement_total)
        score_list.append(score)
    
    return score_list

def compute_score_1(ticker, output_path="/Users/tks/Documents/GitHub/cs591_sns_ipo/output/"):
    DIR_PATH = path.join(output_path, ticker)
    
    json_list = sorted(listdir(DIR_PATH))
    json_list = [x for x in json_list if (x.endswith(".json") and x[:-5].isdigit())]
    
    master_file_path = path.join(DIR_PATH, "master.json")
    with open(master_file_path) as json_file:
        data = json.load(json_file)
    
    rt_sum, fav_sum = 0, 0
    
    for i in range(len(json_list)):
        test_key = i
        rt_sum += int(data[str(test_key)]["retweet_count"])
        fav_sum += int(data[str(test_key)]["favorite_count"])
    
    engagement_total = rt_sum + fav_sum
    
    score_list = []
        
    for i in range(len(json_list)):
        test_key = i
        rt_count = int(data[str(test_key)]["retweet_count"])
        fav_count = int(data[str(test_key)]["favorite_count"])
        
        score = ((rt_count + fav_count) / engagement_total) + 1
        score_list.append(score)
    
    return score_list

def compute_score_2(ticker, output_path="/Users/tks/Documents/GitHub/cs591_sns_ipo/output/"):
    DIR_PATH = path.join(output_path, ticker)
    
    json_list = sorted(listdir(DIR_PATH))
    json_list = [x for x in json_list if (x.endswith(".json") and x[:-5].isdigit())]
    
    master_file_path = path.join(DIR_PATH, "master.json")
    with open(master_file_path) as json_file:
        data = json.load(json_file)
    
    rt_sum, fav_sum = 0, 0
    
    for i in range(len(json_list)):
        test_key = i
        rt_sum += int(data[str(test_key)]["retweet_count"])
        fav_sum += int(data[str(test_key)]["favorite_count"])
    
    engagement_total = rt_sum + fav_sum
    score_total = 0
        
    avg_score = 1 / len(json_list)

    score_list = []
    for i in range(len(json_list)):
        test_key = i
        rt_count = int(data[str(test_key)]["retweet_count"])
        fav_count = int(data[str(test_key)]["favorite_count"])
        
        score = ((rt_count + fav_count) / engagement_total) + avg_score
        score_list.append(score)
    
    return score_list

def compute_score_3(ticker, output_path="/Users/tks/Documents/GitHub/cs591_sns_ipo/output/"):
    DIR_PATH = path.join(output_path, ticker)
    
    json_list = sorted(listdir(DIR_PATH))
    json_list = [x for x in json_list if (x.endswith(".json") and x[:-5].isdigit())]
    
    master_file_path = path.join(DIR_PATH, "master.json")
    with open(master_file_path) as json_file:
        data = json.load(json_file)
    
    rt_sum, fav_sum = 0, 0
    min_score = float("inf")
    for i in range(len(json_list)):
        test_key = i
        rt_count = int(data[str(test_key)]["retweet_count"])
        fav_count = int(data[str(test_key)]["favorite_count"])
        
        rt_sum += rt_count
        fav_sum += fav_count
        engagement = rt_count + fav_count
        
        if engagement > 0 and engagement < min_score:
            min_score = engagement

    engagement_total = rt_sum + fav_sum

    score_list = []
    for i in range(len(json_list)):
        test_key = i
        rt_count = int(data[str(test_key)]["retweet_count"])
        fav_count = int(data[str(test_key)]["favorite_count"])
        
        score = ((rt_count + fav_count) / engagement_total) + min_score
        score_list.append(score)
    
    return score_list

def compute_score_4(ticker, output_path="/Users/tks/Documents/GitHub/cs591_sns_ipo/output/"):
    DIR_PATH = path.join(output_path, ticker)
    
    json_list = sorted(listdir(DIR_PATH))
    json_list = [x for x in json_list if (x.endswith(".json") and x[:-5].isdigit())]
    
    master_file_path = path.join(DIR_PATH, "master.json")
    with open(master_file_path) as json_file:
        data = json.load(json_file)
    
    rt_sum, fav_sum = 0, 0
    
    for i in range(len(json_list)):
        test_key = i
        rt_sum += int(data[str(test_key)]["retweet_count"])
        fav_sum += int(data[str(test_key)]["favorite_count"])
    
    engagement_total = rt_sum + fav_sum
    
    score_list = []
        
    for i in range(len(json_list)):
        test_key = i
        rt_count = int(data[str(test_key)]["retweet_count"])
        fav_count = int(data[str(test_key)]["favorite_count"])
        
        if rt_count + fav_count == 0:
            score = (1 / engagement_total)            
        else:
            score = ((rt_count + fav_count) / engagement_total)
        score_list.append(score)
    
    return score_list

def add_scores(df, ticker):
    output_path = path.join(REPO_PATH, f"output/")

    lst_0 = []
    lst_1 = []
    lst_2 = []
    lst_3 = []

    scores_0 = compute_score_0(ticker, output_path=output_path)
    scores_1 = compute_score_1(ticker, output_path=output_path)
    scores_2 = compute_score_2(ticker, output_path=output_path)
    scores_3 = compute_score_3(ticker, output_path=output_path)
        
    for i, l in enumerate(df["compound"]):
        lst_0.append(l * scores_0[i])
        lst_1.append(l * scores_1[i])
        lst_2.append(l * scores_2[i])
        lst_3.append(l * scores_3[i])

    df["weighted_compound_0"] = lst_0
    df["score_0"] = scores_0
    df["weighted_compound_1"] = lst_1
    df["score_1"] = scores_1
    df["weighted_compound_2"] = lst_2
    df["score_2"] = scores_2
    df["weighted_compound_3"] = lst_3
    df["score_3"] = scores_3

    return df

# ------------------------------------
# Helper functions for utility
# ------------------------------------

def save_df(df, title=None, save_path=None):
    PATH_TO_REPO = "/Users/tks/Documents/GitHub/cs591_sns_ipo" # change me
    
    if save_path is None:
        save_path = path.join(PATH_TO_REPO, "sentiment_data")
    if not path.isdir(save_path):
        makedirs(save_path)
    if title is None:
        title = "Untitled_" + str(len(listdir(save_path)))
    
    save_file = path.join(save_path, title)
    
    df.to_csv(save_file)

# Initial Stuff

In [35]:
import re, string
import random
import json as json
from nltk import word_tokenize
from nltk.corpus import twitter_samples
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import classify
from nltk import NaiveBayesClassifier

stop_words = stopwords.words('english')

with open('master.json') as f:
    palantir = json.load(f)
    
def tweets_to_string(data):    
    list = []   
    for key in data:
        list += [data[key]['text']]
    return list
        
palantir_tweets = tweets_to_string(palantir)

positive_tweets = twitter_samples.strings('positive_tweets.json')

negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')

positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')
palantir_tweet_tokens = [word_tokenize(i) for i in palantir_tweets]

    

def lemmatize_sentence(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = []
    for word, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
    return lemmatized_sentence

def remove_noise(tweet_tokens, stop_words = ()):
    cleaned_tokens = []
    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []
palantir_cleaned_tokens_list = []

for tokens in positive_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

for tokens in negative_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
    
for tokens in palantir_tweet_tokens:
    palantir_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
    
def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token

all_pos_words = get_all_words(positive_cleaned_tokens_list)
all_neg_words = get_all_words(negative_cleaned_tokens_list)
all_palantir_words = get_all_words(palantir_cleaned_tokens_list)

#freq_dist_pos = FreqDist(all_pos_words)
#print(freq_dist_pos.most_common(10))

def get_tweets_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)

positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)
palantir_tokens_for_model = get_tweets_for_model(palantir_cleaned_tokens_list)

positive_dataset = [(tweet_dict, "Positive")
                     for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, "Negative")
                     for tweet_dict in negative_tokens_for_model]

palantir_positive_dataset = [(tweet_dict, "Positive")
                     for tweet_dict in palantir_tokens_for_model]

palantir_negative_dataset = [(tweet_dict, "Negative")
                     for tweet_dict in palantir_tokens_for_model]

dataset = positive_dataset + negative_dataset
palantir_dataset = palantir_positive_dataset + palantir_negative_dataset

random.shuffle(dataset)
random.shuffle(palantir_dataset)

train_data = dataset[:7000]
test_data = dataset[7000:]

classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, palantir_dataset))

print(classifier.show_most_informative_features(100))

FileNotFoundError: [Errno 2] No such file or directory: 'master.json'

# Palantir Info

In [36]:
TWEETS_PATH = path.join(REPO_PATH, "output/PLTR/master.json")

#get the tweets from the json
with open(TWEETS_PATH) as f:
    palantir = json.load(f)
    
palantir_tweets = tweets_to_string(palantir) #array of related tweets
retweets = retweet_count(palantir)
favorites = favorite_count(palantir)

#print(palantir_tweets)

# calculate the sentiment for each tweet 
scores = [] # array that stores the sentiment value (neg, neu, pos, compound) 
            # where compound is the aggregate sentiment
analyser = SentimentIntensityAnalyzer()
for tweet in palantir_tweets:
    score = analyser.polarity_scores(tweet)
    scores.append(score)
    
# print the tweet and the associated score
# i = 0
# for i in range(len(palantir_tweets)):
    # print(str(i+1) + ".json: "+ palantir_tweets[i] + ", score = " + json.dumps(scores[i]))

txtblob = []
for tweet in palantir_tweets:
    #print(TextBlob(tweet).sentiment.polarity)
    txtblob.append(TextBlob(tweet).sentiment.polarity)
    
vader = []
for tweet in palantir_tweets:
    tmp = sentiment_analyzer_scores(tweet) 
    vader.append(tmp)
    
#Converting List of Dictionaries into Dataframe
dataFrame = pd.DataFrame(scores)
dataFrame['vader'] = vader
dataFrame['txtblob'] = txtblob
dataFrame['retweets'] = retweets
dataFrame['favorites'] = favorites
print(dataFrame)
#dataFrame.mean()

dataFrame = add_scores(dataFrame, "PLTR")
save_df(dataFrame, "PLTR_Tweets.csv")

      neg    neu    pos  compound  vader   txtblob  retweets  favorites
0    0.00  1.000  0.000    0.0000      0  0.000000         0          1
1    0.00  0.759  0.241    0.6908      1  0.350000         0          0
2    0.00  0.759  0.241    0.5859      1  0.600000         0          0
3    0.00  1.000  0.000    0.0000      0  0.000000         0          0
4    0.00  0.794  0.206    0.6369      1  0.200000         1          6
..    ...    ...    ...       ...    ...       ...       ...        ...
392  0.11  0.890  0.000   -0.3818     -1 -0.500000         6         31
393  0.00  0.854  0.146    0.3400      1  0.100000         0          1
394  0.00  1.000  0.000    0.0000      0  0.183168         1          1
395  0.00  0.905  0.095    0.2960      1  0.000000         0          0
396  0.00  0.586  0.414    0.5473      1  0.800000         0          0

[397 rows x 8 columns]


# Palantir After INFO

In [37]:
TWEETS_PATH = path.join(REPO_PATH, "output/PLTR/after/after_master.json")

#get the tweets from the json
with open(TWEETS_PATH) as f:
    palantir = json.load(f)
    

palantir_tweets = tweets_to_string(palantir) #array of related tweets
retweets = retweet_count(palantir)
favorites = favorite_count(palantir)

#print(palantir_tweets)

# calculate the sentiment for each tweet 
scores = [] # array that stores the sentiment value (neg, neu, pos, compound) 
            # where compound is the aggregate sentiment
analyser = SentimentIntensityAnalyzer()
for tweet in palantir_tweets:
    score = analyser.polarity_scores(tweet)
    scores.append(score)
    
# print the tweet and the associated score
# i = 0
# for i in range(len(palantir_tweets)):
    # print(str(i+1) + ".json: "+ palantir_tweets[i] + ", score = " + json.dumps(scores[i]))

txtblob = []
for tweet in palantir_tweets:
    #print(TextBlob(tweet).sentiment.polarity)
    txtblob.append(TextBlob(tweet).sentiment.polarity)
    
    
vader = []
for tweet in palantir_tweets:
    tmp = sentiment_analyzer_scores(tweet) 
    vader.append(tmp)
    
#Converting List of Dictionaries into Dataframe
dataFrame = pd.DataFrame(scores)
dataFrame['vader'] = vader
dataFrame['txtblob'] = txtblob
dataFrame['retweets'] = retweets
dataFrame['favorites'] = favorites
print(dataFrame)
#dataFrame.mean()

save_df(dataFrame, "PLTR_after_Tweets.csv")

       neg    neu    pos  compound  vader   txtblob  retweets  favorites
0    0.000  1.000  0.000    0.0000      0  0.062500         0          0
1    0.089  0.813  0.098    0.0701      1  0.261111         0          1
2    0.000  1.000  0.000    0.0000      0  0.000000         0          0
3    0.000  1.000  0.000    0.0000      0 -0.155556         0          3
4    0.000  1.000  0.000    0.0000      0  0.000000         0          0
..     ...    ...    ...       ...    ...       ...       ...        ...
198  0.000  1.000  0.000    0.0000      0  0.062500         0          0
199  0.165  0.670  0.165    0.0000      0  0.216667         0          1
200  0.000  1.000  0.000    0.0000      0  0.062500         0          0
201  0.000  0.698  0.302    0.6369      1  0.033333         0          0
202  0.000  1.000  0.000    0.0000      0  0.200000         0          3

[203 rows x 8 columns]


# PLTR NEWS INFO

In [38]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# news goes in order bloomberg, marketwatch, cnbc, yahoo finance, seeking alpha
# title, teaser text
palantir_news = [["Bloomberg", "Palantir Direct Listing Reference Price Set at $7.25 by NYSE", "The New York Stock Exchange has set a reference price of $7.25 for the direct listing of Palantir Technologies Inc., the data mining company co-founded by technology billionaire Peter Thiel. No shares change hands at the reference price, though it acts as a guide to investors and is determined based on recent secondary trading performance. At $7.25 a share, Palantir would have a market value of about $16 billion on a fully diluted basis. The reference price doesn’t dictate where the shares will start trading and at what Palantir’s actual valuation will be."],
                ["MarketWatch", "Private Market Trading in Palantir Shared Surged Ahead of Direct Listing", "Private market trading in Palantir TEchnologies shares accelerated after the first reports in June that the big software company was planning to go public, according to data from SharesPost, a company that makes markets in pre-IPO venture-backed companies. Palantir will open for trading on Wednesday in a direct listing on the New York Stock Exchange under the ticker PLTR. SharesPost has been transacting in Palantir shares since 2013, conducting $231 million in trades covering a total 38 million shares, the company said in an interview with Barron’s."],
                ["CNBC", "Palantir reference price for direct listing comes in at $7.25 a share, NYSE says", "The New York Stock Exchange said on Tuesday that the reference price for Palantir’s direct listing is $7.25 a share. Based on a fully-diluted share count of 2.17 billion shares outstanding, which includes unvested options and stock, the reference price would indicate a valuation of $15.7 billion, below its private valuation of $20.4 billion in 2015. Palantir provides software and data analytics services to help government agencies and large companies make sense of vast amounts of information. Co-founders include Peter Thiel, the billionaire Facebook investor and supporter of President Donald Trump’s 2016 presidential campaign, who initially funded Palantir and remains its biggest shareholder.  The company is pursuing a direct listing instead of a traditional IPO, meaning the company isn’t issuing new shares and is instead allowing existing shareholders to sell stock to new investors. The reference price is typically reflective of recent private market trades does not necessarily point to where the stock will open on Wednesday."],
                ["Yahoo Finance", "Yahoo Finance Palantir eyes $22B market debut — here's what the secretive big-data firm does", "Palantir (PLTR), the secretive big-data firm co-founded by billionaire PayPal (PYPL) co-founder and Facebook (FB) investor Peter Thiel, will make its stock market debut via a direct listing on Sept. 30 with a valuation estimated at $22 billion. Much of what Palantir does and how it uses its troves of data is opaque to all but the most dedicated followers. Founded in 2004 with funding from the CIA’s not-for-profit venture capital arm In-Q-Tel, Palantir is named for mystical orbs in J.R.R. Tolkien’s “The Lord of the Rings” universe that can see both the past and present and allow users to communicate over vast distances. That’s not exactly far afield of how Palantir itself operates. It provides customized software to clients analyzing large swaths of data for purposes ranging from finding suspected criminals to improving companies’ manufacturing capabilities."],
                ["Seeking Alpha", "Palantir direct listing reference price set at $7.25","Data mining firm Palantir (PLTR) will also have a direct listing tomorrow and the reference price was set at $7.25. In the past month, Palantir's weighted average price on the private market rose from $7.31 to $9.17. Last week, WSJ sources said bankers thought Palantir would start trading at around $10/share."]]

# calculate the sentiment for each news source title, teaser 
scores = [] # array that stores the sentiment value (neg, neu, pos, compound) 
            # where compound is the aggregate sentiment
analyser = SentimentIntensityAnalyzer()
for news in palantir_news:
    title_score = analyser.polarity_scores(news[1])
    teaser_score = analyser.polarity_scores(news[2])
    title_score['source'] = news[0]
    teaser_score['source'] = news[0]
    title_score['type'] = "title"
    teaser_score['type'] = "teaser"
    scores.append(title_score)
    scores.append(teaser_score)
    
dataFrame = pd.DataFrame(scores)
print(dataFrame)

save_df(dataFrame, "PLTR_News.csv")

     neg    neu    pos  compound         source    type
0  0.000  1.000  0.000    0.0000      Bloomberg   title
1  0.019  0.888  0.093    0.7440      Bloomberg  teaser
2  0.000  0.806  0.194    0.3400    MarketWatch   title
3  0.000  0.924  0.076    0.7063    MarketWatch  teaser
4  0.000  0.855  0.145    0.2960           CNBC   title
5  0.000  0.895  0.105    0.9393           CNBC  teaser
6  0.000  1.000  0.000    0.0000  Yahoo Finance   title
7  0.049  0.881  0.070    0.4748  Yahoo Finance  teaser
8  0.000  1.000  0.000    0.0000  Seeking Alpha   title
9  0.000  1.000  0.000    0.0000  Seeking Alpha  teaser


# KC INFO

In [39]:
TWEETS_PATH = path.join(REPO_PATH, "output/KC//master.json")

#get the tweets from the json
with open(TWEETS_PATH) as f:
    palantir = json.load(f)


palantir_tweets = tweets_to_string(palantir) #array of related tweets
retweets = retweet_count(palantir)
favorites = favorite_count(palantir)

#print(palantir_tweets)

# calculate the sentiment for each tweet 
scores = [] # array that stores the sentiment value (neg, neu, pos, compound) 
            # where compound is the aggregate sentiment
analyser = SentimentIntensityAnalyzer()
for tweet in palantir_tweets:
    score = analyser.polarity_scores(tweet)
    scores.append(score)
    
# print the tweet and the associated score
# i = 0
# for i in range(len(palantir_tweets)):
    # print(str(i+1) + ".json: "+ palantir_tweets[i] + ", score = " + json.dumps(scores[i]))

txtblob = []
for tweet in palantir_tweets:
    #print(TextBlob(tweet).sentiment.polarity)
    txtblob.append(TextBlob(tweet).sentiment.polarity)
    

    
vader = []
for tweet in palantir_tweets:
    tmp = sentiment_analyzer_scores(tweet) 
    vader.append(tmp)
    
#Converting List of Dictionaries into Dataframe
dataFrame = pd.DataFrame(scores)
dataFrame['vader'] = vader
dataFrame['txtblob'] = txtblob
dataFrame['retweets'] = retweets
dataFrame['favorites'] = favorites
print(dataFrame)
#dataFrame.mean()

dataFrame = add_scores(dataFrame, "KC")
save_df(dataFrame, "KC_Tweets.csv")

      neg    neu    pos  compound  vader   txtblob  retweets  favorites
0   0.000  1.000  0.000    0.0000      0  0.083333         4          1
1   0.286  0.714  0.000   -0.7906     -1 -0.042045         0          0
2   0.000  1.000  0.000    0.0000      0  0.125000         0          0
3   0.000  1.000  0.000    0.0000      0  0.125000         0          0
4   0.000  0.881  0.119    0.4019      1  0.203125         0          2
..    ...    ...    ...       ...    ...       ...       ...        ...
61  0.286  0.714  0.000   -0.7906     -1 -0.042045         0          0
62  0.000  0.879  0.121    0.2960      1  0.000000         0          0
63  0.286  0.714  0.000   -0.7906     -1 -0.042045         0          0
64  0.000  1.000  0.000    0.0000      0  0.000000         2          8
65  0.000  1.000  0.000    0.0000      0  0.125000         0          0

[66 rows x 8 columns]


# KC AFTER INFO

In [40]:
TWEETS_PATH = path.join(REPO_PATH, "output/PLTR/after/after_master.json")

#get the tweets from the json
with open(TWEETS_PATH) as f:
    palantir = json.load(f)
    

palantir_tweets = tweets_to_string(palantir) #array of related tweets
retweets = retweet_count(palantir)
favorites = favorite_count(palantir)

#print(palantir_tweets)

# calculate the sentiment for each tweet 
scores = [] # array that stores the sentiment value (neg, neu, pos, compound) 
            # where compound is the aggregate sentiment
analyser = SentimentIntensityAnalyzer()
for tweet in palantir_tweets:
    score = analyser.polarity_scores(tweet)
    scores.append(score)
    
# print the tweet and the associated score
# i = 0
# for i in range(len(palantir_tweets)):
    # print(str(i+1) + ".json: "+ palantir_tweets[i] + ", score = " + json.dumps(scores[i]))

txtblob = []
for tweet in palantir_tweets:
    #print(TextBlob(tweet).sentiment.polarity)
    txtblob.append(TextBlob(tweet).sentiment.polarity)

    
vader = []
for tweet in palantir_tweets:
    tmp = sentiment_analyzer_scores(tweet) 
    vader.append(tmp)
    
#Converting List of Dictionaries into Dataframe
dataFrame = pd.DataFrame(scores)
dataFrame['vader'] = vader
dataFrame['txtblob'] = txtblob
dataFrame['retweets'] = retweets
dataFrame['favorites'] = favorites
print(dataFrame)
#dataFrame.mean()

save_df(dataFrame, "KC_after_Tweets.csv")

       neg    neu    pos  compound  vader   txtblob  retweets  favorites
0    0.000  1.000  0.000    0.0000      0  0.062500         0          0
1    0.089  0.813  0.098    0.0701      1  0.261111         0          1
2    0.000  1.000  0.000    0.0000      0  0.000000         0          0
3    0.000  1.000  0.000    0.0000      0 -0.155556         0          3
4    0.000  1.000  0.000    0.0000      0  0.000000         0          0
..     ...    ...    ...       ...    ...       ...       ...        ...
198  0.000  1.000  0.000    0.0000      0  0.062500         0          0
199  0.165  0.670  0.165    0.0000      0  0.216667         0          1
200  0.000  1.000  0.000    0.0000      0  0.062500         0          0
201  0.000  0.698  0.302    0.6369      1  0.033333         0          0
202  0.000  1.000  0.000    0.0000      0  0.200000         0          3

[203 rows x 8 columns]


# KC NEWS INFO

In [41]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# news goes in order bloomberg, marketwatch, cnbc, yahoo finance, seeking alpha
# title, teaser text
kc_news = [["Bloomberg", "Kingsoft Cloud Jumps in First Big U.S. IPO Since Luckin Fall", "Kingsoft Cloud Holdings Ltd. rose 40% in the first major trading debut by a Chinese company since the accounting scandal at Luckin Coffee Inc.The affiliate of Hong Kong-listed Kingsoft Corp. raised $510 million in its initial public offering, pricing its shares at the midpoint of a $16 to $18 targeted range. The shares closed at $23.84 in New York trading Friday, giving the company a market value of $4.77 billion."],
                ["MarketWatch", "Chinese company Kingsoft Cloud's IPO raises more than half a billion dollars", "While the COVID-19 pandemic has largely stalled the market for initial public offerings, a Chinese cloud-computing company still managed to raise more than half a billion dollars in a U.S. IPO. Kingsoft Cloud Holdings Ltd. KC, -2.44% priced its IPO of 30 million American Depositary Shares at $17 apiece, raising $510 million at a valuation of roughly $3.7 billion. The company priced the shares at the midpoint of its suggested range of $16 to $18, and sold 5 million more shares than it was expecting earlier in the process."],
                ["CNBC", "Chinese cloud firm gears up for $3.8 billion US listing despite coronavirus uncertainty", "Chinese internet and software company, Kingsoft, is spinning off its cloud division and listing it in the U.S. despite the uncertainty in the markets. It comes in the face of a global coronavirus pandemic that has sent stocks tumbling, and the recent fraud scandal with China’s Luckin Coffee."],
                ["Yahoo Finance", "Why Kingsoft Corporation Limited (HKG:3888) Could Be Worth Watching", "Kingsoft Corporation Limited (HKG:3888), which is in the software business, and is based in China, saw a significant share price rise of over 20% in the past couple of months on the SEHK. As a mid-cap stock with high coverage by analysts, you could assume any recent changes in the company’s outlook is already priced into the stock. However, what if the stock is still a bargain? Let’s examine Kingsoft’s valuation and outlook in more detail to determine if there’s still a bargain opportunity."],
                ["Seeking Alpha", "Kingsoft Cloud prices IPO", "Kingsoft Cloud Holdings Limited (KC) has priced its initial public offering of 30M American depositary shares at $17/ADS for a total offering size of ~$510M. Each ADS represents 15 ordinary shares of the Company. Underwriters over-allotment is an additional 4.5M ADSs."]]

# calculate the sentiment for each news source title, teaser 
scores = [] # array that stores the sentiment value (neg, neu, pos, compound) 
            # where compound is the aggregate sentiment
analyser = SentimentIntensityAnalyzer()
for news in kc_news:
    title_score = analyser.polarity_scores(news[1])
    teaser_score = analyser.polarity_scores(news[2])
    title_score['source'] = news[0]
    teaser_score['source'] = news[0]
    title_score['type'] = "title"
    teaser_score['type'] = "teaser"
    scores.append(title_score)
    scores.append(teaser_score)
    
dataFrame = pd.DataFrame(scores)
print(dataFrame)

save_df(dataFrame, "KC_News.csv")

     neg    neu    pos  compound         source    type
0  0.000  1.000  0.000    0.0000      Bloomberg   title
1  0.038  0.843  0.119    0.6486      Bloomberg  teaser
2  0.000  1.000  0.000    0.0000    MarketWatch   title
3  0.019  0.907  0.074    0.6240    MarketWatch  teaser
4  0.000  0.855  0.145    0.2584           CNBC   title
5  0.125  0.837  0.038   -0.6872           CNBC  teaser
6  0.176  0.648  0.176    0.0000  Yahoo Finance   title
7  0.021  0.864  0.115    0.7579  Yahoo Finance  teaser
8  0.000  1.000  0.000    0.0000  Seeking Alpha   title
9  0.043  0.858  0.099    0.3612  Seeking Alpha  teaser


# ASAN INFO

In [42]:
TWEETS_PATH = path.join(REPO_PATH, "output/ASAN/master.json")

#get the tweets from the json
with open(TWEETS_PATH) as f:
    palantir = json.load(f)
    

palantir_tweets = tweets_to_string(palantir) #array of related tweets
retweets = retweet_count(palantir)
favorites = favorite_count(palantir)

#print(palantir_tweets)

# calculate the sentiment for each tweet 
scores = [] # array that stores the sentiment value (neg, neu, pos, compound) 
            # where compound is the aggregate sentiment
analyser = SentimentIntensityAnalyzer()
for tweet in palantir_tweets:
    score = analyser.polarity_scores(tweet)
    scores.append(score)
    
# print the tweet and the associated score
# i = 0
# for i in range(len(palantir_tweets)):
    # print(str(i+1) + ".json: "+ palantir_tweets[i] + ", score = " + json.dumps(scores[i]))

txtblob = []
for tweet in palantir_tweets:
    #print(TextBlob(tweet).sentiment.polarity)
    txtblob.append(TextBlob(tweet).sentiment.polarity)
    
    
vader = []
for tweet in palantir_tweets:
    tmp = sentiment_analyzer_scores(tweet) 
    vader.append(tmp)
    
#Converting List of Dictionaries into Dataframe
dataFrame = pd.DataFrame(scores)
dataFrame['vader'] = vader
dataFrame['txtblob'] = txtblob
dataFrame['retweets'] = retweets
dataFrame['favorites'] = favorites
print(dataFrame)
#dataFrame.mean()

dataFrame = add_scores(dataFrame, "ASAN")
save_df(dataFrame, "ASAN_Tweets.csv")

      neg    neu  pos  compound  vader   txtblob  retweets  favorites
0   0.000  1.000  0.0    0.0000      0  0.000000         0          0
1   0.000  1.000  0.0    0.0000      0  0.000000         0          0
2   0.000  1.000  0.0    0.0000      0  0.000000         0          1
3   0.000  1.000  0.0    0.0000      0  0.000000         0          2
4   0.000  1.000  0.0    0.0000      0  0.100000         0          0
..    ...    ...  ...       ...    ...       ...       ...        ...
90  0.000  1.000  0.0    0.0000      0  0.000000         0          0
91  0.000  1.000  0.0    0.0000      0 -0.150000         3         11
92  0.000  1.000  0.0    0.0000      0  0.100000         0          0
93  0.087  0.913  0.0   -0.3182     -1  0.118182         0          0
94  0.000  1.000  0.0    0.0000      0  0.100000         2          3

[95 rows x 8 columns]


# ASAN AFTER INFO

In [43]:
TWEETS_PATH = path.join(REPO_PATH, "output/ASAN/after/after_master.json")

#get the tweets from the json
with open(TWEETS_PATH) as f:
    palantir = json.load(f)
    

palantir_tweets = tweets_to_string(palantir) #array of related tweets
retweets = retweet_count(palantir)
favorites = favorite_count(palantir)

#print(palantir_tweets)

# calculate the sentiment for each tweet 
scores = [] # array that stores the sentiment value (neg, neu, pos, compound) 
            # where compound is the aggregate sentiment
analyser = SentimentIntensityAnalyzer()
for tweet in palantir_tweets:
    score = analyser.polarity_scores(tweet)
    scores.append(score)
    
# print the tweet and the associated score
# i = 0
# for i in range(len(palantir_tweets)):
    # print(str(i+1) + ".json: "+ palantir_tweets[i] + ", score = " + json.dumps(scores[i]))

txtblob = []
for tweet in palantir_tweets:
    #print(TextBlob(tweet).sentiment.polarity)
    txtblob.append(TextBlob(tweet).sentiment.polarity)
    

    
vader = []
for tweet in palantir_tweets:
    tmp = sentiment_analyzer_scores(tweet) 
    vader.append(tmp)
    
#Converting List of Dictionaries into Dataframe
dataFrame = pd.DataFrame(scores)
dataFrame['vader'] = vader
dataFrame['txtblob'] = txtblob
dataFrame['retweets'] = retweets
dataFrame['favorites'] = favorites
print(dataFrame)
#dataFrame.mean()

save_df(dataFrame, "ASAN_after_Tweets.csv")

       neg    neu    pos  compound  vader  txtblob  retweets  favorites
0    0.000  1.000  0.000    0.0000      0    0.000         0          0
1    0.000  1.000  0.000    0.0000      0    0.000         0          9
2    0.000  1.000  0.000    0.0000      0    0.100         0          0
3    0.000  0.586  0.414    0.8020      1    0.500         0          0
4    0.000  1.000  0.000    0.0000      0    0.000         0          0
..     ...    ...    ...       ...    ...      ...       ...        ...
100  0.223  0.777  0.000   -0.3182     -1    0.000         0          0
101  0.000  1.000  0.000    0.0000      0    0.100         0          0
102  0.000  0.903  0.097    0.4019      1    0.075         0          0
103  0.000  0.811  0.189    0.6800      1    0.000         4         16
104  0.000  1.000  0.000    0.0000      0    0.100         0          1

[105 rows x 8 columns]


# ASAN NEWS INFO

In [44]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# news goes in order bloomberg, marketwatch, cnbc, yahoo finance, seeking alpha
# title, teaser text
asan_news = [["Bloomberg", "Asana's Direct Listing Reference Price Set at $21 by NYSE", "Palantir Technologies isn't the only Peter Thiel-backed company going public this week. Workplace management software firm Asana Inc. is also set to debut through a direct listing, an alternative route to going public last tested in 2019."],
                ["MarketWatch", "IPO market is headed for busiest third quarter since the dot-com years", "The U.S. initial public offering market is expected to see 11 deals and two direct listings this week, adding fuel to what will be the biggest third quarter since the dot.com era. Asana is expected to list with a market value of more than $5 billion, according to Renaissance Capital, an operator of IPO exchange-traded funds and provider of institutional research."],
                ["CNBC", "Stocks making the biggest moves in the premarket: Disney, Micron, Dow Inc., GM & more", "Asana (ASAN) – Asana will also begin trading today on the NYSE, with a reference price of $21 per share for the software company. Palantir and Asana are only the third and fourth companies to go public via direct listing, following Spotify (SPOT) and Slack (WORK)."],
                ["Yahoo Finance", "Asana's Direct Listing IPO: What Investors Need To Know", "Work management company Asana will hit the public markets Wednesday. The Offering: Asana Inc (NYSE: ASAN) will offer 30,030,516 shares in a direct listing. Pricing of the shares will be determined by investor demand prior to going public. NYSE said the reference price for the offering is $21, according to Reuters, although this is not an offering price."],
                ["Seeking Alpha", "Asana plans NYSE direct listing in September - Bloomberg", "Bloomberg sources say workplace productivity software company Asana (ASANA) will turn to the NYSE for its planned direct listing late next month. Nasdaq also reportedly put in a pitch for the deal. Asana's financials could become available as early as next week, and the company will host an analyst day in September. Sources say Asana has been trading for a roughly $5B market value on the secondary market. The details aren't finalized and Asana's plans can still change."]]

# calculate the sentiment for each news source title, teaser 
scores = [] # array that stores the sentiment value (neg, neu, pos, compound) 
            # where compound is the aggregate sentiment
analyser = SentimentIntensityAnalyzer()
for news in asan_news:
    title_score = analyser.polarity_scores(news[1])
    teaser_score = analyser.polarity_scores(news[2])
    title_score['source'] = news[0]
    teaser_score['source'] = news[0]
    title_score['type'] = "title"
    teaser_score['type'] = "teaser"
    scores.append(title_score)
    scores.append(teaser_score)
    
dataFrame = pd.DataFrame(scores)
print(dataFrame)

save_df(dataFrame, "ASAN_News.csv")

     neg    neu    pos  compound         source    type
0  0.000  1.000  0.000    0.0000      Bloomberg   title
1  0.000  1.000  0.000    0.0000      Bloomberg  teaser
2  0.000  1.000  0.000    0.0000    MarketWatch   title
3  0.000  0.962  0.038    0.3400    MarketWatch  teaser
4  0.000  1.000  0.000    0.0000           CNBC   title
5  0.000  0.953  0.047    0.2960           CNBC  teaser
6  0.000  1.000  0.000    0.0000  Yahoo Finance   title
7  0.024  0.867  0.109    0.6486  Yahoo Finance  teaser
8  0.000  1.000  0.000    0.0000  Seeking Alpha   title
9  0.000  0.970  0.030    0.3400  Seeking Alpha  teaser


# SNOW INFO

In [45]:
TWEETS_PATH = path.join(REPO_PATH, "output/SNOW/master.json")

#get the tweets from the json
with open(TWEETS_PATH) as f:
    palantir = json.load(f)
    

palantir_tweets = tweets_to_string(palantir) #array of related tweets
retweets = retweet_count(palantir)
favorites = favorite_count(palantir)

#print(palantir_tweets)

# calculate the sentiment for each tweet 
scores = [] # array that stores the sentiment value (neg, neu, pos, compound) 
            # where compound is the aggregate sentiment
analyser = SentimentIntensityAnalyzer()
for tweet in palantir_tweets:
    score = analyser.polarity_scores(tweet)
    scores.append(score)
    
# print the tweet and the associated score
# i = 0
# for i in range(len(palantir_tweets)):
    # print(str(i+1) + ".json: "+ palantir_tweets[i] + ", score = " + json.dumps(scores[i]))

txtblob = []
for tweet in palantir_tweets:
    #print(TextBlob(tweet).sentiment.polarity)
    txtblob.append(TextBlob(tweet).sentiment.polarity)
    
    
vader = []
for tweet in palantir_tweets:
    tmp = sentiment_analyzer_scores(tweet) 
    vader.append(tmp)
    
#Converting List of Dictionaries into Dataframe
dataFrame = pd.DataFrame(scores)
dataFrame['vader'] = vader
dataFrame['txtblob'] = txtblob
dataFrame['retweets'] = retweets
dataFrame['favorites'] = favorites
print(dataFrame)
#dataFrame.mean()

dataFrame = add_scores(dataFrame, "SNOW")
save_df(dataFrame, "SNOW_Tweets.csv")

       neg    neu    pos  compound  vader   txtblob  retweets  favorites
0    0.116  0.633  0.251    0.8617      1  0.000000         0          0
1    0.000  0.913  0.087    0.2960      1  0.000000         0          0
2    0.000  0.872  0.128    0.2960      1  0.000000         0          0
3    0.000  0.863  0.137    0.4215      1 -0.250000         0          0
4    0.000  0.828  0.172    0.4019      1  0.000000         5         13
..     ...    ...    ...       ...    ...       ...       ...        ...
595  0.000  1.000  0.000    0.0000      0  0.062500         0          0
596  0.000  0.863  0.137    0.4215      1 -0.250000         0          0
597  0.000  1.000  0.000    0.0000      0  0.000000         0          0
598  0.000  1.000  0.000    0.0000      0  0.000000         0          1
599  0.000  1.000  0.000    0.0000      0  0.383333         0          0

[600 rows x 8 columns]


# SNOW NEWS INFO

In [46]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# news goes in order bloomberg, marketwatch, cnbc, yahoo finance, seeking alpha
# title, teaser text
snow_news = [["Bloomberg", "Snowflake Prices IPO Above Marketed Range at $120 a Share", "All eyes are on cloud-data software maker Snowflake Inc.’s trading debut after it raised $3.36 billion in the year’s biggest U.S. initial public offering for an operating company and the largest in software ever."],
                ["MarketWatch", "Snowflake Raises Its IPO Price BY 30%", "Data storage software company Snowflake has raised its estimated initial public offering price. According to an updated S-1 filing, Snowflake has increased its IPO offering by 30%. Snowflake now expects to go public on September 16 at a share price of between $100 and $110. The company previously said that its shares would debut at between $75 and $85 a share."],
                ["CNBC", "Snowflake prices IPO above increased range, implying initial market cap of $33.3 billion", "Snowflake, a provider of cloud-based data storage and analysis software, priced its IPO above its increased range in an offering that values the company at $33.3 billion."],
                ["Yahoo Finance", "IPO Calendar Filled With Big Deals, Like Warren Buffett Backed Snow", "It's going to be a busy week ahead for initial public offerings, including one backed by Warren Buffett's Berkshire Hathaway (BRKB) that's drawing heightened interest. The IPO calendar is filled with large technology offerings. Berkshire Hathaway plans to plow $570 million into the IPO of Snowflake (SNOW), a company that provides businesses with a tightly integrated data management system. Additionally, the venture capital arm of Salesforce.com (CRM) will invest $250 million. The Snowflake IPO plans to raise $2.2 billion by offering 28 million shares at a price range of $75 to $85. It trades Wednesday under the ticker SNOW."],
                ["Seeking Alpha", "Snowflake prices IPO at $120 per share for $33B valuation - Dow Jones", "That $120 is well above the most recently hoped-for range of $100-$110 (which itself was lifted from $75-$85), and the $33B valuation is miles above the $12.4B valuation Snowflake received in a funding round earlier this year. Shares will begin trading tomorrow on the NYSE under the symbol SNOW."]]

# calculate the sentiment for each news source title, teaser 
scores = [] # array that stores the sentiment value (neg, neu, pos, compound) 
            # where compound is the aggregate sentiment
analyser = SentimentIntensityAnalyzer()
for news in snow_news:
    title_score = analyser.polarity_scores(news[1])
    teaser_score = analyser.polarity_scores(news[2])
    title_score['source'] = news[0]
    teaser_score['source'] = news[0]
    title_score['type'] = "title"
    teaser_score['type'] = "teaser"
    scores.append(title_score)
    scores.append(teaser_score)
    
dataFrame = pd.DataFrame(scores)
print(dataFrame)

save_df(dataFrame, "SNOW_News.csv")

   neg    neu    pos  compound         source    type
0  0.0  0.804  0.196    0.2960      Bloomberg   title
1  0.0  1.000  0.000    0.0000      Bloomberg  teaser
2  0.0  1.000  0.000    0.0000    MarketWatch   title
3  0.0  0.868  0.132    0.7717    MarketWatch  teaser
4  0.0  0.851  0.149    0.2732           CNBC   title
5  0.0  0.839  0.161    0.5859           CNBC  teaser
6  0.0  0.714  0.286    0.3818  Yahoo Finance   title
7  0.0  0.938  0.062    0.6486  Yahoo Finance  teaser
8  0.0  0.845  0.155    0.2960  Seeking Alpha   title
9  0.0  0.916  0.084    0.5106  Seeking Alpha  teaser


# U INFO

In [47]:
TWEETS_PATH = path.join(REPO_PATH, "output/U/master.json")

#get the tweets from the json
with open(TWEETS_PATH) as f:
    palantir = json.load(f)
    

palantir_tweets = tweets_to_string(palantir) #array of related tweets
retweets = retweet_count(palantir)
favorites = favorite_count(palantir)

#print(palantir_tweets)

# calculate the sentiment for each tweet 
scores = [] # array that stores the sentiment value (neg, neu, pos, compound) 
            # where compound is the aggregate sentiment
analyser = SentimentIntensityAnalyzer()
for tweet in palantir_tweets:
    score = analyser.polarity_scores(tweet)
    scores.append(score)
    
# print the tweet and the associated score
# i = 0
# for i in range(len(palantir_tweets)):
    # print(str(i+1) + ".json: "+ palantir_tweets[i] + ", score = " + json.dumps(scores[i]))

txtblob = []
for tweet in palantir_tweets:
    #print(TextBlob(tweet).sentiment.polarity)
    txtblob.append(TextBlob(tweet).sentiment.polarity)
    
    
vader = []
for tweet in palantir_tweets:
    tmp = sentiment_analyzer_scores(tweet) 
    vader.append(tmp)
    
#Converting List of Dictionaries into Dataframe
dataFrame = pd.DataFrame(scores)
dataFrame['vader'] = vader
dataFrame['txtblob'] = txtblob
dataFrame['retweets'] = retweets
dataFrame['favorites'] = favorites
print(dataFrame)
#dataFrame.mean()

dataFrame = add_scores(dataFrame, "U")
save_df(dataFrame, "U_Tweets.csv")

       neg    neu    pos  compound  vader   txtblob  retweets  favorites
0    0.092  0.690  0.218    0.4939      1  0.800000         1          1
1    0.000  0.833  0.167    0.4588      1  0.312500         0          0
2    0.000  1.000  0.000    0.0000      0 -0.350000         0          0
3    0.098  0.671  0.232    0.4939      1  0.800000         0          0
4    0.000  0.853  0.147    0.4767      1  0.000000         0          0
..     ...    ...    ...       ...    ...       ...       ...        ...
595  0.000  0.891  0.109    0.4215      1  0.184091         0          0
596  0.000  0.862  0.138    0.3400      1 -0.133333         0          0
597  0.000  1.000  0.000    0.0000      0 -0.400000         0          0
598  0.098  0.671  0.232    0.4939      1  0.800000         2          5
599  0.000  1.000  0.000    0.0000      0 -0.300000         0          0

[600 rows x 8 columns]


# U NEWS INFO

In [48]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# news goes in order bloomberg, marketwatch, cnbc, yahoo finance, seeking alpha
# title, teaser text
u_news = [["Bloomberg", "Unity to Use IPO to Move Past Games to Real-World Challenges", "Unity plans to offer 25 million shares at an estimated price of $44 to $48 apiece when it lists on the New York Stock Exchange on Friday, according to filings with the U.S. Securities and Exchange Commission. The company raised it's price range Wednesday from a previous range of $34 to $42 per share. Based on the number of shares outstanding after the offering, the IPO could value Unity at $12.6 billion."],
                ["MarketWatch", "Unity Software raises expected pricing of IPO, lifting what it could raise to up to $1.2 billion", "Unity Software Inc. U, +3.62% raised on Wednesday the expected pricing of its initial public offering to $44 to $48 a share from $34 to $42 a share. With the new expected pricing, the San Francisco-based company, which makes software to create videogames, could raise up to $1.20 billion and be valued at up to $12.64 billion."],
                ["CNBC", "This will be the biggest year ever for software IPOs as coronavirus and work-from-home show value", "The software business is on fire. This is going to be by far the biggest year for software IPOs in history. Four big software IPOs are pricing this week: Snowflake, Unity Software, JFrog, and Sumo Logic. Palantir and Asana are coming at the end of September. Unity Software does software for 3-D games. Half of the top online games use this platform, and the CEO came from Electronic Arts."],
                ["Yahoo Finance", "Unity to Use IPO to Move Past Games to Real-World Challenges", "For more than a decade, game developers have been using a tool called the Unity engine and its easy-to-use tutorials to create more than half of the top 1,000 mobile games available on the Apple Inc. and Google app stores. That simplicity has turned Unity Technologies Inc. into a household name in the industry. But the 16-year-old company behind the engine has ambitions beyond games. Unity is going public this week seeking to raise $1.2 billion to expand into 3D software that helps companies solve a wide range of real-world challenges -- from training new employees to simulating passenger traffic in a busy airport. The stock market debut also comes as one of its biggest competitors, Fortnite creator Epic Games Inc., is locked in a legal battle with Apple."],
                ["Seeking Alpha", "Unity Software raises IPO price range to $44 - $48", "Unity Software (U) has raised the expected price range of its IPO to $44 - $48 per share from $34 - $42 per share. With the new pricing, the company could raise up to $1.2B and be valued at up to $12.64B. The company addresses a total market opportunity of ~$29B across both gaming and other industries. In gaming, the market opportunity for Create Solutions and Operate Solutions to be ~$12B in 2019 across over 15M potential creators, growing to over $16B in 2025 and in industries beyond gaming, the market opportunity for Create Solutions and Operate Solutions to be ~$17B. As of June 30, 2020, the company had ~1.5M monthly active creators in over 190 countries and territories worldwide. The charts below illustrate the growth in customers contributing more than $100K of trailing 12-month revenue, and the percentage of trailing 12-month revenue represented by such customers, as of the end of each of the last ten quarters."]]

# calculate the sentiment for each news source title, teaser 
scores = [] # array that stores the sentiment value (neg, neu, pos, compound) 
            # where compound is the aggregate sentiment
analyser = SentimentIntensityAnalyzer()
for news in u_news:
    title_score = analyser.polarity_scores(news[1])
    teaser_score = analyser.polarity_scores(news[2])
    title_score['source'] = news[0]
    teaser_score['source'] = news[0]
    title_score['type'] = "title"
    teaser_score['type'] = "teaser"
    scores.append(title_score)
    scores.append(teaser_score)
    
dataFrame = pd.DataFrame(scores)
print(dataFrame)

save_df(dataFrame, "U_News.csv")

     neg    neu    pos  compound         source    type
0  0.000  0.885  0.115    0.0772      Bloomberg   title
1  0.000  0.798  0.202    0.9260      Bloomberg  teaser
2  0.000  1.000  0.000    0.0000    MarketWatch   title
3  0.000  0.849  0.151    0.8126    MarketWatch  teaser
4  0.000  0.862  0.138    0.3400           CNBC   title
5  0.034  0.941  0.025   -0.1531           CNBC  teaser
6  0.000  0.885  0.115    0.0772  Yahoo Finance   title
7  0.024  0.870  0.106    0.8074  Yahoo Finance  teaser
8  0.000  1.000  0.000    0.0000  Seeking Alpha   title
9  0.000  0.808  0.192    0.9798  Seeking Alpha  teaser


# ZI INFO

In [49]:
TWEETS_PATH = path.join(REPO_PATH, "output/ZI/master.json")

#get the tweets from the json
with open(TWEETS_PATH) as f:
    palantir = json.load(f)
    

palantir_tweets = tweets_to_string(palantir) #array of related tweets
retweets = retweet_count(palantir)
favorites = favorite_count(palantir)

#print(palantir_tweets)

# calculate the sentiment for each tweet 
scores = [] # array that stores the sentiment value (neg, neu, pos, compound) 
            # where compound is the aggregate sentiment
analyser = SentimentIntensityAnalyzer()
for tweet in palantir_tweets:
    score = analyser.polarity_scores(tweet)
    scores.append(score)
    
# print the tweet and the associated score
# i = 0
# for i in range(len(palantir_tweets)):
    # print(str(i+1) + ".json: "+ palantir_tweets[i] + ", score = " + json.dumps(scores[i]))

txtblob = []
for tweet in palantir_tweets:
    #print(TextBlob(tweet).sentiment.polarity)
    txtblob.append(TextBlob(tweet).sentiment.polarity)
    

    
vader = []
for tweet in palantir_tweets:
    tmp = sentiment_analyzer_scores(tweet) 
    vader.append(tmp)
    
#Converting List of Dictionaries into Dataframe
dataFrame = pd.DataFrame(scores)
dataFrame['vader'] = vader
dataFrame['txtblob'] = txtblob
dataFrame['retweets'] = retweets
dataFrame['favorites'] = favorites
print(dataFrame)
#dataFrame.mean()

dataFrame = add_scores(dataFrame, "ZI")
save_df(dataFrame, "ZI_Tweets.csv")

    neg    neu    pos  compound  vader  txtblob  retweets  favorites
0   0.0  1.000  0.000    0.0000      0     0.00         0          0
1   0.0  1.000  0.000    0.0000      0     0.00         0          0
2   0.0  0.913  0.087    0.2732      1     0.20         0          0
3   0.0  0.698  0.302    0.8591      1     0.45         0          0
4   0.0  0.667  0.333    0.8591      1     0.45         2          5
..  ...    ...    ...       ...    ...      ...       ...        ...
80  0.0  1.000  0.000    0.0000      0     0.00         0          0
81  0.0  0.913  0.087    0.2732      1     0.20         0          0
82  0.0  0.913  0.087    0.2732      1     0.20         0          0
83  0.0  1.000  0.000    0.0000      0     0.00         0          0
84  0.0  0.938  0.062    0.2003      1     0.00         0          0

[85 rows x 8 columns]


# ZI AFTER INFO

In [50]:
TWEETS_PATH = path.join(REPO_PATH, "output/ZI/after/after_master.json")

#get the tweets from the json
with open(TWEETS_PATH) as f:
    palantir = json.load(f)
    

palantir_tweets = tweets_to_string(palantir) #array of related tweets
retweets = retweet_count(palantir)
favorites = favorite_count(palantir)

#print(palantir_tweets)

# calculate the sentiment for each tweet 
scores = [] # array that stores the sentiment value (neg, neu, pos, compound) 
            # where compound is the aggregate sentiment
analyser = SentimentIntensityAnalyzer()
for tweet in palantir_tweets:
    score = analyser.polarity_scores(tweet)
    scores.append(score)
    
# print the tweet and the associated score
# i = 0
# for i in range(len(palantir_tweets)):
    # print(str(i+1) + ".json: "+ palantir_tweets[i] + ", score = " + json.dumps(scores[i]))

txtblob = []
for tweet in palantir_tweets:
    #print(TextBlob(tweet).sentiment.polarity)
    txtblob.append(TextBlob(tweet).sentiment.polarity)
    
    
vader = []
for tweet in palantir_tweets:
    tmp = sentiment_analyzer_scores(tweet) 
    vader.append(tmp)
    
#Converting List of Dictionaries into Dataframe
dataFrame = pd.DataFrame(scores)
dataFrame['vader'] = vader
dataFrame['txtblob'] = txtblob
dataFrame['retweets'] = retweets
dataFrame['favorites'] = favorites
print(dataFrame)
#dataFrame.mean()

save_df(dataFrame, "ZI_after_Tweets.csv")

     neg    neu    pos  compound  vader  txtblob  retweets  favorites
0    0.0  0.913  0.087    0.2960      1     0.00         0          0
1    0.0  1.000  0.000    0.0000      0     0.00         0          0
2    0.0  0.857  0.143    0.5893      1     0.75         0          0
3    0.0  0.913  0.087    0.2960      1     0.00         0          0
4    0.0  1.000  0.000    0.0000      0     0.00         1          0
..   ...    ...    ...       ...    ...      ...       ...        ...
189  0.0  1.000  0.000    0.0000      0     0.00         0          0
190  0.0  0.913  0.087    0.2023      1     0.50         0          0
191  0.0  1.000  0.000    0.0000      0     0.00         1          0
192  0.0  0.886  0.114    0.4019      1    -0.05         0          1
193  0.0  1.000  0.000    0.0000      0     0.00         0          0

[194 rows x 8 columns]


# ZI NEWS INFO

In [51]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# news goes in order bloomberg, marketwatch, cnbc, yahoo finance, seeking alpha
# title, teaser text
zi_news = [["Bloomberg", "Yet Another Zoom Risks More Stock Confusion With ZoomInfo Debut", "Investors could face more name confusion this week when ZoomInfo Technologies Inc. joins Zoom Video Communications Inc. on the Nasdaq Stock Market. ZoomInfo, which provides data on sales prospects, is expected to price its initial public offering on June 3 and begin trading under the ticker symbol ZI the following day. It will join three other publicly traded companies globally whose names begin with Zoo, the most well-known of which is the maker of video-conferencing software that has become a household name during the coronavirus pandemic"],
                ["MarketWatch", "ZoomInfo raises expected pricing of IPO, boosts what it could raise to $890 million", "ZoomInfo Technologies ZI, +1.21% disclosed Tuesday that the expected pricing of its initial public offering of shares increased to between $19 and $20 a share, with the sales and marketing software company now set to raise up to $890 million through its offering of 44.5 million shares. Last week, the IPO was expected to price at $16 to $18 a share, with the company raising up to $801 million. "],
                ["CNBC", "ZoomInfo aiming to price IPO $1 above revised price range", "ZoomInfo is aiming to price its initial public offering $1 ahead of its already-buoyed price range, two sources familiar with the matter told CNBC. The company had already raised its IPO price range Tuesday morning, but now could target as high as $21 a share and rake in nearly $1 billion. ZoomInfo, which will trade under the symbol “ZI,” said in a government filing on Tuesday that it raised its IPO price range to between $19 and $20 per share amid strong demand, up from an initial estimate of between $16 and $18. The final decision on how to price its IPO has yet to be made."],
                ["Yahoo Finance", "ZoomInfo IPO: What You Need To Know", "ZoomInfo Technologies Inc. (NASDAQ: ZI) will issue 44.5 million shares on the Nasdaq under ticker 'ZI,' according to the firm’s S-1 filing. Priced between $16 and $18, the offering represents 98.1% of outstanding shares and is expected to bring in about $921.2 million. On Thursday morning, the IPO was priced at $21 per share. The lead underwriters include JPMorgan and Morgan Stanley. The company qualifies as an emerging growth company under the U.S. JOBS Act, which exempts management from certain SEC disclosure requirements."],
                ["Seeking Alpha", "ZoomInfo raises price range for IPO", "ZoomInfo Technologies (ZI) now expects to price its initial public offering of 44.5M shares at between $19-$20 per share, from $16-$18 previously. JPMorgan Chase and Morgan Stanley are leading the sale."]]

# calculate the sentiment for each news source title, teaser 
scores = [] # array that stores the sentiment value (neg, neu, pos, compound) 
            # where compound is the aggregate sentiment
analyser = SentimentIntensityAnalyzer()
for news in zi_news:
    title_score = analyser.polarity_scores(news[1])
    teaser_score = analyser.polarity_scores(news[2])
    title_score['source'] = news[0]
    teaser_score['source'] = news[0]
    title_score['type'] = "title"
    teaser_score['type'] = "teaser"
    scores.append(title_score)
    scores.append(teaser_score)
    
dataFrame = pd.DataFrame(scores)
print(dataFrame)

save_df(dataFrame, "ZI_News.csv")

     neg    neu    pos  compound         source    type
0  0.364  0.636  0.000   -0.5542      Bloomberg   title
1  0.028  0.923  0.049    0.2315      Bloomberg  teaser
2  0.000  0.850  0.150    0.3182    MarketWatch   title
3  0.000  0.854  0.146    0.8360    MarketWatch  teaser
4  0.000  1.000  0.000    0.0000           CNBC   title
5  0.015  0.888  0.097    0.8537           CNBC  teaser
6  0.000  1.000  0.000    0.0000  Yahoo Finance   title
7  0.000  0.834  0.166    0.9231  Yahoo Finance  teaser
8  0.000  1.000  0.000    0.0000  Seeking Alpha   title
9  0.000  0.868  0.132    0.5267  Seeking Alpha  teaser
