In [81]:
# Mounting drive

from google.colab import drive

drive.mount('/content/drive')

import os
os.chdir("/content/drive/Shareddrives/InformationRetrieval_2022/Lab1-Modeling_and_Indexing/project/inputs")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import libraries

In [82]:
import json
import csv
import pandas as pd
import numpy as np
import re
import datetime

In [83]:
import nltk
nltk.download('stopwords')

from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Function definitions

In [84]:
def clean(text):
    # transform to lowercase
    cleanText = text.lower()

    # removing the urls from tweeet, starts with https
    cleanText = re.sub('https?:\/\/.*[\r\n]*', '', cleanText, flags=re.MULTILINE)

    # removing nonalphanumeric
    cleanText = re.sub(r'[\W]+', ' ', cleanText)
    cleanText = re.sub(r'[\_]+', '', cleanText)
    
    return cleanText
 

In [85]:
def build_terms(text):

    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))

    text = clean(text)

    # Tokenize the text to get a list of terms
    text =  text.split()

    # Eliminate the stopwords 
    text = [word for word in text if word not in stop_words]  

    # Stemming (keeping the root of the words)
    text = [stemmer.stem(word) for word in text] 
    
    return text

In [86]:
def create_mapping(filename, key, value, verbose=True):
  
  # Read the csv file
  mapping_df = pd.read_csv(filename, header=None, sep='\s', index_col=False)

  # Change column names 
  mapping_df.columns = [key, value]

  # Transform the pandas DataFrame to a dictionary {doc_id: tweet_id}
  mapping = dict()

  for idx, row in mapping_df.iterrows():
    mapping[row[key]] = row[value]

  if verbose:
    print('The length of the mapping is: ', len(mapping_df))
    print('Example of structure: ({} : {})'.format(list(mapping.keys())[0], list(mapping.values())[0]))

  return mapping

# Solution implementation

In [87]:
# Build a dictionary {doc_id: tweet_id}

doc_to_tweet = create_mapping('tweet_document_ids_map.csv', key='doc_id', value='tweet_id')

The length of the mapping is:  4000
Example of structure: (doc_1 : 1575918182698979328)


In [88]:
# Read data from .json

data = pd.read_json('tw_hurricane_data.json', lines = True)

In [89]:
# Checking that all tweets are in english

data['lang'].value_counts()

en    4000
Name: lang, dtype: int64

In [90]:
# Analyze original data

data.head()

Unnamed: 0,created_at,id,id_str,full_text,truncated,display_text_range,entities,extended_entities,metadata,source,...,is_quote_status,retweet_count,favorite_count,favorited,retweeted,possibly_sensitive,lang,quoted_status_id,quoted_status_id_str,quoted_status
0,2022-09-30 18:39:08+00:00,1575918182698979328,1575918182698979328,So this will keep spinning over us until 7 pm…...,False,"[0, 76]","{'hashtags': [{'text': 'HurricaneIan', 'indice...","{'media': [{'id': 1575918178261254162, 'id_str...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""http://twitter.com/download/iphone"" r...",...,False,0,0,False,False,0.0,en,,,
1,2022-09-30 18:39:01+00:00,1575918151862304768,1575918151862304768,Our hearts go out to all those affected by #Hu...,False,"[0, 136]","{'hashtags': [{'text': 'HurricaneIan', 'indice...",,"{'iso_language_code': 'en', 'result_type': 're...","<a href=""https://sproutsocial.com"" rel=""nofoll...",...,False,0,0,False,False,,en,,,
2,2022-09-30 18:38:58+00:00,1575918140839673873,1575918140839673856,Kissimmee neighborhood off of Michigan Ave. \n...,False,"[0, 58]","{'hashtags': [{'text': 'HurricaneIan', 'indice...","{'media': [{'id': 1575918121080311808, 'id_str...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""http://twitter.com/download/iphone"" r...",...,False,0,0,False,False,0.0,en,,,
3,2022-09-30 18:38:57+00:00,1575918135009738752,1575918135009738752,I have this one tree in my backyard that scare...,False,"[0, 141]","{'hashtags': [{'text': 'scwx', 'indices': [122...",,"{'iso_language_code': 'en', 'result_type': 're...","<a href=""http://twitter.com/download/iphone"" r...",...,False,0,0,False,False,,en,,,
4,2022-09-30 18:38:53+00:00,1575918119251419136,1575918119251419136,@AshleyRuizWx @Stephan89441722 @lilmizzheidi @...,False,"[127, 280]","{'hashtags': [{'text': 'HurricaneIan', 'indice...",,"{'iso_language_code': 'en', 'result_type': 're...","<a href=""http://twitter.com/download/iphone"" r...",...,False,0,0,False,False,,en,,,


### Creating a dictionary of tweets

In [91]:
# Init dictionary
tweets = dict()


for idx, row in data.iterrows():

  # Init dictionary for each tweet
  tweet = dict()
    
  tweet['text'] = build_terms(row['full_text'])   # Clean and format text

  tweet['username'] = row['user']['screen_name']

  tweet['date'] =  row['created_at'].strftime('%d/%m/%Y %H:%M:%S')  # Format date

  tweet['hashtags'] = [h['text'] for h in row['entities']['hashtags']]  # Store a list of hashtags

  tweet['likes'] = row['favorite_count']

  tweet['retweets'] = row['retweet_count']

  tweet['url'] = 'https://twitter.com/'+row['user']['screen_name']+'/status/'+str(row['id'])  # Construct the url

  # Add tweet to the dictionary
  tweets[row['id']] = tweet


## Checks

In [92]:
print('The document doc_id: {} has tweet_id: {}\n'.format('doc_1', doc_to_tweet['doc_1']))
print('In the intial DataFrame we have tweet_id {} at position 0\n'.format(data.iloc[0]['id']))
print('The original full_text is: [{}]'.format(data.iloc[0]['full_text']))

The document doc_id: doc_1 has tweet_id: 1575918182698979328

In the intial DataFrame we have tweet_id 1575918182698979328 at position 0

The original full_text is: [So this will keep spinning over us until 7 pm…go away already. #HurricaneIan https://t.co/VROTxNS9rz]


In [93]:
# Tweet after formatting

tweets[1575918140839673873]

{'text': ['kissimme', 'neighborhood', 'michigan', 'ave', 'hurricaneian'],
 'username': 'CHeathWFTV',
 'date': '30/09/2022 18:38:58',
 'hashtags': ['HurricaneIan'],
 'likes': 0,
 'retweets': 0,
 'url': 'https://twitter.com/CHeathWFTV/status/1575918140839673873'}