# Convert tweet IDs into tweet text

In [1]:
import os
import time

import numpy as np
import pandas as pd
import tweepy

In [2]:
# Sets up Twitter API access.
#
# The four OAuth credentials are read from environment variables instead of
# being written into the notebook, so they cannot leak when the notebook is
# shared or committed. A KeyError is raised here if any variable is unset.

consumer_key = os.environ["TWITTER_CONSUMER_KEY"]
consumer_secret = os.environ["TWITTER_CONSUMER_SECRET"]
access_token = os.environ["TWITTER_ACCESS_TOKEN"]
access_token_secret = os.environ["TWITTER_ACCESS_TOKEN_SECRET"]

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)

# Example of looking up a single tweet id:
#tweetFetched = api.get_status(1228393702244134912)
#print(tweetFetched.text)

# Import Twitter ID JSON File

In [3]:
# Load the TwiSty JSON file and transpose it; the transposed frame is what
# the rest of the notebook works with.
json_file = pd.read_json(
    '../NLP_MBTI_Classification/twisty/TwiSty-ES.json'
).transpose()
json_file

Unnamed: 0,other_tweet_ids,mbti,user_id,confirmed_tweet_ids,gender
54056853,"[566773199212666880, 585051894512545793, 58445...",ENTJ,54056853,"[676146060193079296, 446745936958865408, 57718...",M
2573243225,"[648930148654415872, 664987246764924928, 64612...",INFP,2573243225,"[646509854648049664, 642744838832685057, 64889...",F
399751937,"[631138007282749441, 634229567264301056, 64943...",INTP,399751937,"[672629376589111297, 683482513252155392, 66337...",M
541982091,"[659517076869144576, 664204395563847680, 66485...",INFJ,541982091,"[672139144819249152, 660784321603706880, 67571...",F
70392126,"[483226984043253760, 471686257534595073, 62794...",ISTJ,70392126,"[667361613314437120, 590695776218644485, 49250...",F
...,...,...,...,...,...
165848831,"[150328353260441601, 530779540457791489, 16663...",ENFP,165848831,"[110441331024863232, 405190077535354881, 12593...",F
128377140,"[305742251139661824, 305740357050703872, 30574...",INFJ,128377140,"[311823910645678080, 84439538260393984, 122385...",M
122450877,"[9631407336853504, 24025150416, 24128451353, 5...",ESFJ,122450877,"[132599939426553856, 25242217895436288, 669667...",F
248404858,"[431186916822638593, 456537307504336896, 56778...",ENTJ,248404858,"[470612847370993665, 431954224692215808, 43159...",M


The index is the same as `user_id`. `confirmed_tweet_ids` holds the tweet IDs that are confirmed to be written in the language of this corpus file.

In [4]:
# Keep a random subset of 5000 rows, drawn without replacement (the default).
# No seed is set, so a different subset is drawn on every run.
json_file = json_file.sample(n=5000)

In [5]:
# Keep only the tweet-id, user-id and MBTI columns, expand each list of tweet
# ids so that every id gets its own row, then renumber the rows from 0.
json_file = (
    json_file[['confirmed_tweet_ids', 'user_id', 'mbti']]
    .explode('confirmed_tweet_ids')
    .reset_index(drop=True)
)
json_file

Unnamed: 0,confirmed_tweet_ids,user_id,mbti
0,580186352580427776,844084939,ENFP
1,569863966698184704,844084939,ENFP
2,571028386086293506,844084939,ENFP
3,598202090041778176,844084939,ENFP
4,585811351584538624,844084939,ENFP
...,...,...,...
6320331,155527057932173312,65861809,ENFP
6320332,27492784667,65861809,ENFP
6320333,21944163882,65861809,ENFP
6320334,32601289288318977,65861809,ENFP


In [6]:
# Tweet ids stored in the existing results CSV (the same file the last cell
# of this notebook writes to).
sampled_tweets_already = pd.read_csv(
    '../NLP_MBTI_Classification/ES-trial-text.csv',
    index_col=0,
)['confirmed_tweet_ids']

In [7]:
# Sample a fraction of the rows for training/test set
sample_number = 13958
sample_json_file = json_file.sample(n=sample_number, replace=False)
sample_json_file

Unnamed: 0,confirmed_tweet_ids,user_id,mbti
1621147,679302623246315520,2376948354,INTJ
5267654,352825662832263169,112320072,ESFP
2154928,560846480773484544,1972597249,INTP
4159777,168914863958142976,169351417,ENFP
4450493,444967689463664640,98169504,ENFP
...,...,...,...
4755063,278932972399390720,129071106,ENFJ
3490311,679292513249837056,300265697,INFP
1688797,228489882379644928,71076748,ESTJ
3912837,359511806433173504,49523341,ENTJ


In [8]:
# Count how many of the freshly sampled tweet ids are already present in the
# existing results CSV.
#
# `value in series` tests the Series' *index labels*, not its values, so the
# previously fetched ids are collected into a set and membership is tested
# against that set instead.
# NOTE(review): the ids read back from the CSV appear to be floats (see the
# concat output further down); large ids may therefore not compare equal to
# the exact integer ids sampled here -- confirm the stored dtype.
already_fetched_ids = set(sampled_tweets_already.dropna())
total = sum(
    1
    for tweet_id in sample_json_file['confirmed_tweet_ids']
    if tweet_id in already_fetched_ids
)

print(total)

0


# Twitter API

Twitter API fetches around 2,000 tweets an hour.

Goal: collect 25,000 tweets in total. An 80/20 train/test split then gives 20,000 training tweets and 5,000 test tweets.

In [9]:
# Rough run-time estimate: sample size divided by 2000 tweets per hour.
print("Number of Hours to Fetch Tweets:", sample_number / 2000)

Number of Hours to Fetch Tweets: 6.979


In [10]:
# Value passed to time.sleep() after each successful lookup in the fetch loop.
sleepTime = 2
# Accumulates [tweet_id, tweet_text] pairs for every tweet that was fetched.
tweet_ids_and_tweet_text = list()

In [11]:
# Tries to find the tweet id in the twitter database
for twitter_id in sample_json_file['confirmed_tweet_ids']:
    try:
        tweetFetched = api.get_status(twitter_id)
        tweet_ids_and_tweet_text.append([twitter_id, tweetFetched.text])
        time.sleep(sleepTime)
    except:
        continue
        

In [12]:
# Turn the collected [tweet_id, tweet_text] pairs into a two-column table.
twitter_text = pd.DataFrame(
    data=tweet_ids_and_tweet_text,
    columns=['confirmed_tweet_ids', 'twitter_text'],
)

In [14]:
# Attach the fetched text to the sampled ids. The outer join keeps every
# sampled id, so twitter_text is NaN wherever the tweet could not be found.
# validate="one_to_one" makes pandas raise if a tweet id is duplicated on
# either side of the join.
import_ids = sample_json_file.merge(
    twitter_text,
    how='outer',
    on="confirmed_tweet_ids",
    sort=False,
    validate="one_to_one",
)

import_ids

Unnamed: 0,confirmed_tweet_ids,user_id,mbti,twitter_text
0,679302623246315520,2376948354,INTJ,
1,352825662832263169,112320072,ESFP,@PableteSeverson 1000 por lo bajo
2,560846480773484544,1972597249,INTP,@xulia_98 Más bien hambre.
3,168914863958142976,169351417,ENFP,"@djiked te los perdiste, ya terminaron…Estaban..."
4,444967689463664640,98169504,ENFP,
...,...,...,...,...
13953,278932972399390720,129071106,ENFJ,@RoberthLeon ajajaja upss se me salio !! De es...
13954,679292513249837056,300265697,INFP,@Ech_marina @Cristian_Croft me meo porque me r...
13955,228489882379644928,71076748,ESTJ,
13956,359511806433173504,49523341,ENTJ,"Cuando piensas que nada puede ir peor, te das ..."


In [15]:
# Load the existing results CSV, using its first column as the index, and
# show its dimensions.
previous_df = pd.read_csv(
    '../NLP_MBTI_Classification/ES-trial-text.csv',
    index_col=0,
)
previous_df.shape

(20002, 4)

In [16]:
# Stack this run's rows underneath the previously saved rows (row-wise
# concatenation is the default axis). Both frames keep their own index labels.
# NOTE: this rebinds `import_ids`, so running the cell a second time would
# prepend `previous_df` again.
import_ids = pd.concat([previous_df, import_ids])
import_ids

Unnamed: 0,confirmed_tweet_ids,user_id,mbti,twitter_text
0,3.39781e+17,3.07518e+07,ESTJ,
1,3.827e+17,9.17325e+07,ESTJ,El marico cualquiera cree que es la persona qu...
2,4.33681e+17,1.00253e+08,ISFJ,Una vaina es manifestar pacíficamente y otrs d...
3,6.8238e+17,1.90371e+08,ENTJ,
4,6.66068e+17,4.58613e+08,ENTP,@SusanaStraus bueno todavía queda la adc de la...
...,...,...,...,...
13953,278932972399390720,129071106,ENFJ,@RoberthLeon ajajaja upss se me salio !! De es...
13954,679292513249837056,300265697,INFP,@Ech_marina @Cristian_Croft me meo porque me r...
13955,228489882379644928,71076748,ESTJ,
13956,359511806433173504,49523341,ENTJ,"Cuando piensas que nada puede ir peor, te das ..."


In [17]:
# Write the combined table back to the CSV that the earlier results were
# read from, replacing its previous contents.
import_ids.to_csv(
    '../NLP_MBTI_Classification/ES-trial-text.csv'
)