In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob

In [3]:
# Build the NLP pipeline: load the `en_core_web_sm` spaCy model and append a
# SpacyTextBlob component, which is what exposes `doc._.sentiment` later on.
# NOTE(review): passing a component *instance* to `add_pipe` looks like the
# spaCy v2 calling convention — confirm against the installed spaCy version.
nlp = spacy.load('en_core_web_sm')
spacy_text_blob = SpacyTextBlob()
nlp.add_pipe(spacy_text_blob)

In [14]:
# Sample passage mixing negative and positive sentences, used to try out the
# sentiment pipeline on a single string.
text = "I had a really horrible day. It was the worst day ever! But every now and then I have a really good day that makes me happy."

In [15]:
# Run the pipeline once on the sample text and report the sentiment that the
# SpacyTextBlob component attached to the resulting Doc.
doc = nlp(text)
print('Polarity:', doc._.sentiment.polarity)
# Label corrected from the misspelled 'Sujectivity:'.
print('Subjectivity:', doc._.sentiment.subjectivity)
# One entry per scored phrase; see the printed tuples for their layout.
print('Assessments:', doc._.sentiment.assessments)


Polarity: -0.125
Sujectivity: 0.9
Assessments: [(['really', 'horrible'], -1.0, 1.0, None), (['worst', '!'], -1.0, 1.0, None), (['really', 'good'], 0.7, 0.6000000000000001, None), (['happy'], 0.8, 1.0, None)]


In [16]:
# Three short sample texts, each scored separately further down.
# NOTE: this rebinds `text`, which previously held a single string, to a list.
text = ["This is the best time of my life, ever and ever and forever.",
        "I had a really horrible day. It was the worst day ever!",
        "But every now and then I have a really good day that makes me happy."]

In [17]:
# Rebuild the pipeline from scratch: reload the `en_core_web_sm` model and
# append a fresh SpacyTextBlob component.
# NOTE(review): these are the same three statements as the first setup cell;
# if `nlp` is still in scope this cell looks redundant — confirm before removing.
nlp = spacy.load('en_core_web_sm')
spacy_text_blob = SpacyTextBlob()
nlp.add_pipe(spacy_text_blob)

In [18]:
# Score every text exactly once. Running `nlp` is the expensive step, so the
# sentiment is read from a single parse instead of parsing each text twice
# (once for polarity, once for subjectivity).
rows = []
for tw in text:
    sentiment = nlp(tw)._.sentiment
    rows.append((sentiment.polarity, sentiment.subjectivity))

# One row per input text, in input order; naming the columns here also keeps
# them present when `text` is empty.
result = pd.DataFrame(rows, columns=['polarity', 'subjectivity'])
result

Unnamed: 0,polarity,subjectivity
0,1.0,0.3
1,-1.0,1.0
2,0.75,0.8


In [36]:
# Parameters
my_folder = "../tweets/"
max_batches = 16  # upper bound on how many numbered batch files to look for

# Open each `json` batch file in turn and report how many tweets it holds.
# NOTE: `df` is rebound on every iteration, so after the loop it holds only
# the last batch that was read, not all batches combined.
for i in range(max_batches):
    batch_path = Path(my_folder) / f"tweets_sentiment_{i}.json"
    if not batch_path.exists():
        # Stop at the first missing file, as before. Unlike the previous bare
        # `except`, genuine read errors (malformed JSON, permissions, Ctrl-C)
        # are no longer swallowed and will surface.
        break
    df = pd.read_json(batch_path)
    print("batch ", str(i), ": ", len(df))

batch  0 :  300000
batch  1 :  300000
batch  2 :  300000
batch  3 :  300000
batch  4 :  300000
batch  5 :  300000
batch  6 :  300000
batch  7 :  300000
batch  8 :  300000
batch  9 :  215145


In [17]:
# Sanity check on the batch sizes printed by the loading loop: nine full
# batches of 300000 tweets plus a final batch of 215145.
300000*9+215145

2915145

In [45]:
# Copy the `account` and `tweet` columns under "_2"-suffixed names ...
aux = df.loc[:, ["account", "tweet"]].add_suffix("_2")

# ... and attach them to the right of the original frame, aligned on the index.
result = pd.concat([df, aux], axis=1)

result

Unnamed: 0,account,date,replyCount,retweetCount,likeCount,quoteCount,lang,sourceLabel,username,followersCount,friendsCount,location,tweet,day,announcement,polarity,subjectivity,account_2,tweet_2
0,JustinTrudeau,2020-04-06 16:21:04,0,0,0,0,en,Twitter for iPhone,avantiverdi,175,685,"Greater Vancouver, British Columbia",federal covid-19 benefits program portal opens...,2020-04-06,True,0.000000,0.000000,JustinTrudeau,federal covid-19 benefits program portal opens...
1,JustinTrudeau,2020-04-06 16:21:02,0,6,32,1,en,Twitter Web App,gary_feltham,2825,882,"Calgary, Alberta",@swiftie01 @justintrudeau there have been so m...,2020-04-06,True,0.500000,0.500000,JustinTrudeau,@swiftie01 @justintrudeau there have been so m...
2,JustinTrudeau,2020-04-06 16:20:59,0,0,1,0,en,Twitter Web App,GJacques2008,1489,548,,@thdarkjedi_ @justintrudeau he 's waiting for ...,2020-04-06,True,0.000000,0.000000,JustinTrudeau,@thdarkjedi_ @justintrudeau he 's waiting for ...
3,JustinTrudeau,2020-04-06 16:20:57,0,0,1,0,en,Twitter Web App,Chad14513688,28,394,,@liviofilice @justintrudeau @realdonaldtrump i...,2020-04-06,True,-0.250000,0.600000,JustinTrudeau,@liviofilice @justintrudeau @realdonaldtrump i...
4,JustinTrudeau,2020-04-06 16:20:54,0,0,0,0,en,Twitter for Android,MRobs68,5,8,,"@justintrudeau thanks trudy, i qualify for not...",2020-04-06,True,0.200000,0.200000,JustinTrudeau,"@justintrudeau thanks trudy, i qualify for not..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
215140,JustinTrudeau,2020-04-30 00:00:44,0,0,0,0,en,Twitter for iPhone,MLK20182,654,241,"Ontario, Canada",investigate justin trudeau for financial crime...,2020-04-30,True,0.000000,0.000000,JustinTrudeau,investigate justin trudeau for financial crime...
215141,JustinTrudeau,2020-04-30 00:00:33,0,0,0,0,en,Twitter for iPhone,rfguerrini,1007,2657,The World 🇮🇹 🇨🇦 🇺🇸 🚓,@northerndrums @kingofdaddys @justintrudeau th...,2020-04-30,True,0.366667,0.662963,JustinTrudeau,@northerndrums @kingofdaddys @justintrudeau th...
215142,JustinTrudeau,2020-04-30 00:00:27,1,0,2,0,en,Twitter for iPhone,SRodrigues52,219,412,"London, On",@shelbzzzzy @justintrudeau quit your job 🧐,2020-04-30,True,0.000000,0.000000,JustinTrudeau,@shelbzzzzy @justintrudeau quit your job 🧐
215143,JustinTrudeau,2020-04-30 00:00:19,1,0,0,0,en,Twitter for iPhone,pffoote,356,1531,"St. John's, NL, Canada",@rcaf_arc @cfsnowbirds @canadianforces @cfoper...,2020-04-30,True,0.392857,0.517857,JustinTrudeau,@rcaf_arc @cfsnowbirds @canadianforces @cfoper...


In [2]:
# Parameters
my_folder = "../tweets/"
df = pd.read_json(my_folder + 'tweets_db_clean.json')

In [3]:
print("Variables: add date, announcement, and select only english tweets.")
# Add day: format the timestamp column as a "YYYY-MM-DD" string in a single
# vectorised call rather than a per-row Python loop. This also avoids looking
# rows up by the labels 0..len(df)-1, which only works on a default index.
df['day'] = pd.to_datetime(df['date']).dt.strftime("%Y-%m-%d")
# Boolean flag for the announcement: True for days on or after 2020-03-27.
# ISO-formatted date strings sort chronologically, so string comparison works.
df['announcement'] = df['day'] >= "2020-03-27"
# Select only tweets in English, renumbering the rows from zero.
df2 = df[df['lang']=='en'].reset_index(drop=True)

Variables: add date, announcement, and select only english tweets.


In [4]:
# Row counts: the full frame first (kept in `my_size`), then the
# English-only subset `df2`.
my_size = df.shape[0]
print(my_size, len(df2), sep="\n")

3503875
2915145


In [5]:
# Display the row count of the full frame stored in `my_size`.
my_size

3503875

In [6]:
i = 0  # presumably the starting batch index; rebound by the loop that follows — TODO confirm it is still needed
n = 300_000  # divisor used to split `my_size` rows into batches (rows per batch)

In [7]:
# Number of batches of `n` rows needed to cover `my_size` rows. Integer
# ceiling division avoids the extra, empty batch that `int(my_size / n) + 1`
# yields when `my_size` is an exact multiple of `n`, and avoids float rounding.
n_batches = (my_size + n - 1) // n
for i in range(n_batches):
    print(i)

0
1
2
3
4
5
6
7
8
9
10
11
