# Emerging Topic Detection

## Introduction

The purpose of the study is to develop a NLP model to help authorities to deal with crises. The idea is to select dataset containing emerging topic. Considering those requirements, it has been decided to work on #COVID19 Twitter datasets.
The next step is to choose a dataset that meet the NLP-model requirements among tons of dataset available on the internet.

## 1 - Dataset selection

### 11 - CODA-19

A study was conducted in April by scientists creating a 10,000 tweets dataset specifically to train NLP model. The study is available on this link: https://arxiv.org/abs/2005.02367

In [1]:
import csv
import pandas as pd

data = pd.read_csv("/home/urendil/Documents/01-ENSTA/Cours/PRE/COVID/CODA-19/data/CODA19_v1_20200504/human_label/coda_metadata.csv",encoding='unicode_escape')
data.head()

Unnamed: 0,coda_paper_id,paper_id,coda_data_split,subset,paragraph_num,sentence_num,segment_num,token_num,coda_has_expert_labels,title
0,1,006be5ba67759a525ae8e211f43bd8e4429a64f0,test,custom_license,1,9,16,235,False,Human astroviruses: in silico analysis of the ...
1,2,6a0da82f10bac49659556d074a71136e794ae45d,test,custom_license,2,8,13,204,True,Public health in practice: the three domains o...
2,3,1b822763c007bb77798727490e95ee2bec342494,test,custom_license,1,10,15,322,False,Preparedness of institutions around the world ...
3,4,97ebcdaaa4b9dd8f174eb3aa4137231b5af413fb,test,custom_license,1,3,4,73,False,Effect of the Ebola-virus-disease epidemic on ...
4,5,83303635687128ac583152565ba1d7e29540f2af,test,noncomm_use_subset,2,6,10,213,False,Health care-associated infections -an overview


In [2]:
# Classic useful libraries

import os
import csv
import json
import datetime
import time
from pprint import pprint
import numpy as np
from pathlib import Path
import pandas as pd

# Particular library from Nucleus for this notebook
import nucleus_api
from nucleus_api.rest import ApiException
import nucleus_api.api.nucleus_api as nucleus_helper
    
configuration = nucleus_api.Configuration()
configuration.host = 'nucleus.sumup.ai:5000'
configuration.api_key['x-api-key'] = 'zGtJTrTa4izSMMdssWpOeg'

# Create API instance
api_instance = nucleus_api.NucleusApi(nucleus_api.ApiClient(configuration))

Check the connection and dataset available on the API

In [3]:
try:
    api_response = api_instance.get_list_datasets()
except ApiException as e:
    print("Exception when calling DatasetsApi->get_list_datasets: %s\n" % e)

list_datasets = api_response.result

print(len(list_datasets), 'datasets in the database:')
for ds in list_datasets:
    print('    ', ds.name)

5 datasets in the database:
     CODA19
     COVID19_geolocation
     Drouet_letters_cleaned
     ASRS3
     ASRS1


### Preprocess the dataset: 'time','title' and 'content'

In [4]:
with open('./CODA-19/data/CODA19_v1_20200504/human_label/coda_metadata.csv','r',encoding='UTF8',errors='ignore') as r:
    reader = csv.DictReader(r)
    # Creating an output file to write the preprocessed dataset in
    with open('./CODA-19/data/CODA19_v1_20200504/human_label/20200525_coda19.csv','w') as w: 
        fieldnames=['time','title','content']
        writer = csv.DictWriter(w,fieldnames=fieldnames)
        writer.writeheader()
        # For each line in the input file, write in the output file specifically the row 'coda_paper_id','subset' and 'title' as 'time', 'title' and 'content' 
        for row in reader:
            line=[row['coda_paper_id'],row['subset'],row['title']]
            writer.writerow({'time':line[0],'title':line[1],'content':line[2]})

In [5]:
csv_file = './CODA-19/data/CODA19_v1_20200504/human_label/20200525_coda19.csv'
dataset = 'CODA19'
with open(csv_file,'r',encoding='UTF8',errors='ignore') as r:
    reader = csv.DictReader(r)
    for row in reader:
        json_props = nucleus_helper.upload_jsons(api_instance, dataset, reader, processes=1)
        #json_props = nucleus_api.Appendjsonparams(dataset=dataset, language='english', document={ 'time':row['time'], 'title':row['title'], 'content':row['content']})
        #api_response = api_instance.post_append_json_to_dataset(json_props)
    total_size = 0
    total_jsons = 0
    for jp in json_props:
        total_size += jp.size
        total_jsons += 1
    print(total_jsons,'JSON records(', total_size, 'bytes) appended to', dataset)

INFO: Start polling job status of 2722780




INFO: Job 2722780 completed.
1 JSON records( 28090368 bytes) appended to CODA19


### Topics analysis

In [6]:
#query = '("Trump" OR "president")' # str | Fulltext query, using mysql MATCH boolean query format. Example, (\"word1\" OR \"word2\") AND (\"word3\" OR \"word4\") (optional)
query = ''
#custom_stop_words = ["real","hillary"] # str | List of stop words. (optional)
custom_stop_words=''
num_topics = 8 # int | Number of topics to be extracted from the dataset. (optional) (default to 8)
metadata_selection = "" # dict | JSON object specifying metadata-based queries on the dataset, of type {"metadata_field": "selected_values"} (optional)
time_period = ""     # str | Time period selection. Choices: ["1M","3M","6M","12M","3Y","5Y",""] (optional)
S=[]
K=[]
W=[]

try:
    payload = nucleus_api.Topics(dataset=dataset,                                
                                query=query,                   
                                custom_stop_words=custom_stop_words,     
                                num_topics=num_topics,
                                metadata_selection=metadata_selection,
                                time_period=time_period)
    api_response = api_instance.post_topic_api(payload)        
except ApiException as e:
    api_error = json.loads(e.body)
    print('ERROR:', api_error['message'])

doc_ids = api_response.result.doc_ids
topics = api_response.result.topics

key = 0

for i, res in enumerate(topics):
    print('Topic', i, 'keywords:')
    print('    Keywords:', res.keywords)
    keywords_weight_str = ";".join(str(x) for x in res.keywords_weight)
    print('    Keyword weights:', keywords_weight_str)
    print('    Strength:', res.strength)
    
    doc_topic_exposure_sel = []  # list of non-zero doc_topic_exposure
    doc_id_sel = []        # list of doc ids matching doc_topic_exposure_sel
    for j in range(len(res.doc_topic_exposures)):
        doc_topic_exp = float(res.doc_topic_exposures[j])
        if doc_topic_exp != 0:
            doc_topic_exposure_sel.append(doc_topic_exp)
            doc_id_sel.append(doc_ids[j])
    doc_id_sel_str = ' '.join(str(x) for x in doc_id_sel)
    doc_topic_exposure_sel_str = ' '.join(str(x) for x in doc_topic_exposure_sel)
    #print('    Document IDs:', doc_id_sel_str)
    #print('    Document exposures:', doc_topic_exposure_sel_str)
    print('---------------')
    s.append(res.strength)
    k.append(res.keywords)
    w.append(keywords_weight_str)
    
S.append(s)
K.append(k)
W.append(w)


Topic 0 keywords:
    Keywords: respiratory syndrome;acute respiratory;middle east;east respiratory;severe acute;syndrome coronavirus;syndrome outbreak;management middle
    Keyword weights: 0.1297;0.15;0.2679;0.1192;0.1346;0.1329;0.0379;0.0278
    Strength: 0.2518
---------------


NameError: name 's' is not defined

We create csv files to store the keywords, strenghts and weights from the API results.
We may need it later.

In [None]:
m=len(K)
df_keywords = pd.DataFrame({}, index = [k for k in range (1, len(K[0]))])
for i in range (m):
    df_keywords[K[i][0]] = [K[i][j] for j in range (1, len(K[i]))]
df_keywords.to_csv('keywords.csv', sep=';', header=True)

n=len(S)
df_strength = pd.DataFrame({}, index = [k for k in range (1, len(S[0]))])
for i in range (n):
    df_strength[S[i][0]] = [S[i][j] for j in range (1, len(S[i]))]
df_strength.to_csv('strength.csv', sep=';', header=True)

p=len(W)
df_strength = pd.DataFrame({}, index = [k for k in range (1, len(W[0]))])
for i in range (p):
    df_strength[W[i][0]] = [W[i][j] for j in range (1, len(W[i]))]
df_strength.to_csv('weights.csv', sep=';', header=True)

In [None]:
def store_list(L):
    MyFile=open('Keywords.txt','w')
    n=len(L)
    for i in range (n):
        T=L[i]
        p=len(T)
        for j in range(p):
            MyFile.write(T[j]+'\n')
    MyFile.close()

In [None]:
store_list(S)
store_list(K)

## Bag of words

The content is stored in a file that will be uploaded into the process.

In [None]:
with open('./CODA-19/data/CODA19_v1_20200504/human_label/coda_metadata.csv','r',encoding='UTF8',errors='ignore') as r:
    reader = csv.DictReader(r)
    with open('coda19.txt','w') as w: 
        fieldnames=['content']
        writer = csv.DictWriter(w,fieldnames=fieldnames)
        writer.writeheader()
        # For each line in the input file, write in the output file specifically the row 'coda_paper_id','subset' and 'title' as 'time', 'title' and 'content' 
        for row in reader:
            line=[row['title']]
            writer.writerow({'content':line[0]})         

Nevertheless, the content is required to be computed to chop out/off single words or redundancy.

After that, the words from the content will be vectorized.

In [None]:
t=open('coda19.txt','r')
document=t.readlines()

# api_response.result
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
#document = document.split()

document = [stemmer.lemmatize(word) for word in document]
document = ' '.join(document)

documents.append(document)

stemmer = WordNetLemmatizer()

from sklearn.feature_extraction.text import CountVectorizer

nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(document).toarray()

###  $\textit{Relevance of the dataset}$

How is the choice of the dataset relevent? Is a different dataset changed the output of the model? How close? Distance?

### 12 - COVID19 TweetsID


I tried to download the following dataset: https://github.com/echen102/COVID-19-TweetIDs. 

In [None]:
import json
import pandas as pd

data1 = pd.read_json("/home/urendil/Documents/01-ENSTA/Cours/PRE/COVID/COVID-19-TweetIDs/2020-01/coronavirus-tweet-id-2020-01-21-23.jsonl",encoding='unicode_escape')
data1.head()

### 13 - Kaggle geolocation COVID

In [None]:
# import the necessary libraries
import numpy as np 
import pandas as pd 

# Visualisation libraries
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
from plotly.offline import init_notebook_mode, iplot 
import plotly.graph_objs as go
import plotly.offline as py
import pycountry
py.init_notebook_mode(connected=True)
import folium 
from folium import plugins
from folium.plugins import MarkerCluster


# Graphics in retina format 
%config InlineBackend.figure_format = 'retina' 

# Increase the default plot size and set the color scheme
plt.rcParams['figure.figsize'] = 8, 5
#plt.rcParams['image.cmap'] = 'viridis'


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Disable warnings 
import warnings
warnings.filterwarnings('ignore')

# Reading the dataset
data = pd.read_csv("Kaggle/COVID-19_geo_timeseries_ver_0311.csv")
data.head()

In [None]:
data = data[data.data_source=='jhu']
# Convert Last Update column to datetime64 format
data['update_time'] = pd.to_datetime(data['update_time'])
print(data['update_time'].dtype)
# Extract date from the timestamp
data['update_date'] = data['update_time'].dt.date

In [None]:
from matplotlib import pyplot as plt
import plotly.graph_objects as go
from fbprophet import Prophet
import pycountry
import plotly.express as px

# Import data from John Hopkins University
df = data[data.data_source=='jhu']
df_agg = df.groupby('update_date').agg({'confirmed_cases':'sum','deaths':'sum','recovered':'sum'}).reset_index()
fig = go.Figure()
fig.add_trace(go.Bar(x=df_agg['update_date'],
                y=df_agg['confirmed_cases'],
                name='Confirmed',
                marker_color='blue'
                ))
fig.add_trace(go.Bar(x=df_agg['update_date'],
                y=df_agg['deaths'],
                name='Deaths',
                marker_color='Red'
                ))
fig.add_trace(go.Bar(x=df_agg['update_date'],
                y=df_agg['recovered'],
                name='Recovered',
                marker_color='Green'
                ))

fig.update_layout(
    title='Worldwide Corona Virus Cases - Confirmed, Deaths, Recovered',
    xaxis_tickfont_size=14,
    yaxis=dict(
        title='Number of Cases',
        titlefont_size=16,
        tickfont_size=14,
    ),
    legend=dict(
        x=0,
        y=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)'
    ),
    barmode='group',
    bargap=0.15, # gap between bars of adjacent location coordinates.
    bargroupgap=0.1 # gap between bars of the same location coordinate.
)
fig.show()

In [None]:
# Countries affected
countries = data[(data['country']!='Others') & (data['country']!='Undisclosed')]['country'].unique().tolist()
# Use this print trick to get more readable list output
print(*countries, sep = "\n")
print("\nTotal countries affected by COVID-19: ",len(countries))

In [None]:
# get the latest timestamp
latest_date = data['update_time'].max()
# extract year, month, day from the latest timestamp so we can use it just report the latest data
year = latest_date.year
month = latest_date.month
# adjust for timezone
day = latest_date.day - 1

# Filter to only include the latest day data
from datetime import date
data_latest = data[data['update_time'] > pd.Timestamp(date(year,month,day))]
data_latest.head()

In [None]:
# Creating a dataframe with total no of confirmed cases for every country as of the latest available date
affected_country_latest = data_latest.groupby(['country','country_code','region','latitude','longitude','country_flag']).agg({'update_time': np.max}).reset_index()
key = ['country','country_code','region','latitude','longitude','country_flag','update_time']
global_cases = pd.merge(data_latest, affected_country_latest, how='inner', on=key).drop_duplicates().groupby(key).max().sort_values(by=['confirmed_cases'],ascending=False).reset_index()
global_cases.index+=1
global_cases_columns = global_cases.columns.tolist()
global_cases_columns.remove('update_time')
global_cases = global_cases[global_cases_columns]
global_cases

In [None]:
shape_url = './Kaggle/world-countries.json'
world_geo = shape_url

m = folium.Map(location=[35.86166,104.195397], zoom_start=3,tiles='Stamen Toner')

folium.Choropleth(
    geo_data=world_geo,
    name='choropleth',
    data=global_cases,
    columns=['country', 'confirmed_cases'],
    key_on='feature.properties.name',
    fill_color='OrRd',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Number of Confirmed Cases'
).add_to(m)

for lat, lon, value, name in zip(global_cases['latitude'], global_cases['longitude'], global_cases['confirmed_cases'], global_cases['country']):
    folium.CircleMarker(
        [lat, lon],
        radius=10,
        popup = ('<strong>Country</strong>: ' + str(name).capitalize() + '<br>'
                 '<strong>Confirmed Cases</strong>: ' + str(value) + '<br>'),        
        color='orange',
        fill=True,
        fill_color='orange',
        fill_opacity=0.7
    ).add_to(m)

folium.LayerControl().add_to(m)

m

In [None]:
# Creating a dataframe with total no of confirmed cases for every country for all available dates
key1 = ['country','country_code','region','latitude','longitude','country_flag','update_date']
key2 = ['country','country_code','region','latitude','longitude','country_flag','update_date','confirmed_cases','deaths','recovered']
key3 = ['country','country_code','region','latitude','longitude','country_flag']
df_full = data[data.data_source == 'jhu'][key2].drop_duplicates().groupby(key1).max().reset_index()
# df_full = data[key2].drop_duplicates().groupby(key1).max().sort_values(by=['country','update_date']).groupby(key3).cumsum().sort_values(by=['confirmed_cases','update_date'],ascending=[False,False]).reset_index()
# df_full = df_full.groupby(key1).agg({'confirmed_cases':np.cumsum, 'deaths':np.cumsum ,'recovered':np.cumsum}).reset_index()
df_full[['confirmed_cases','deaths','recovered']] = df_full[['confirmed_cases','deaths','recovered']].fillna(0)
df_full['log_confirmed_cases'] = np.log(df_full['confirmed_cases'])
df_full.sort_values(by=['confirmed_cases','update_date'],ascending=[False,False]).head(10)

In [None]:
import plotly
import plotly.graph_objs as go
from datetime import datetime
from datetime import timedelta

scl = [[0.0, '#e7e1ef'],[0.2, '#d4b9da'],[0.4, '#c994c7'], 
       [0.6, '#df65b0'],[0.8, '#dd1c77'],[1.0, '#980043']] # reds

data_slider = []
all_dates = df_full['update_date'].sort_values().unique()
for m,d in zip(pd.DatetimeIndex(all_dates).month,pd.DatetimeIndex(all_dates).day):
    df_selected = df_full[(pd.DatetimeIndex(df_full['update_date']).month==m) & (pd.DatetimeIndex(df_full['update_date']).day==d)]
    df_selected['text'] =   'Date: '+ df_selected['update_date'].astype(str) \
                            + '<br>' + 'Confirmed Cases: ' + df_selected['confirmed_cases'].astype(str) \
                            + '<br>' + 'Deaths: '+ df_selected['deaths'].astype(str) \
                            + '<br>' + 'Recovered: '+ df_selected['recovered'].astype(str)
    data_one_day = dict(
        type='choropleth',
        colorscale = scl,
        autocolorscale=False,
        locations = df_selected['country'].tolist(),
        z = df_selected['log_confirmed_cases'].tolist(),
        locationmode = 'country names',
        text = df_selected['text'],
        colorbar_title = 'Confirmed Cases (Logarithm)'
    )
    data_slider.append(data_one_day)

steps = []
for i in range(len(data_slider)):
    step = dict(method='restyle',
                args=['visible', [False] * len(data_slider)],
                label=(datetime.strptime('2020-01-21','%Y-%m-%d') + timedelta(days=i)).strftime('%Y-%m-%d')
               )
    step['args'][1][i] = True
    steps.append(step)

sliders = [dict(active=0, pad={"t": 1}, steps=steps)]  

lyt = dict(
    geo=dict(scope='world'), 
    sliders=sliders, 
    title_text = 'COVID-19 Trend Analysis (World)' + '<br>' + '(Hover for breakdown)'
)
fig = dict(data=data_slider, layout=lyt)
plotly.offline.iplot(fig)


### 13 - COVID-19 Tweets dataset

I also tried this one: https://github.com/lopezbec/COVID19_Tweets_Dataset
It would be interesting to work on this dataset but the tweets are unreadable and they first have to be computed out (twarc and hydrate). This process takes time.
To cope with those points, 2.000 tweets/day are randomly picked up.

In [15]:
from twarc import Twarc
import os

consumer_key = "6ojMMS2mWsHgHA9aDwrMk9WG5"
consumer_secret = "KJB6JmTaFKpHwOSAdpIpYibf8SHe6r2qzzFsOunSqfToO0QXS0"
access_token = "968116193663049729-AcELLvOcLGYyhU2MPB1C0TZBLRstqrV"
access_token_secret = "2VvILAdL5F5keEe8hiJuKfuuizUQoQ15nJNJdPgSv3fmy"
t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)

A colab code (https://colab.research.google.com/drive/1reOEk79e8XR5etY6QZBGTkS769X5lEyk) is used to generate a human-readable dataset. But the process using twarc and hydrate is greedy regarding that a day is about 1.5 million tweets.

In [21]:
#@title Check Keywords to Hydrate { run: "auto" }
coronavirus = True #@param {type:"boolean"}
virus = False #@param {type:"boolean"}
covid = True #@param {type:"boolean"}
ncov19 = False #@param {type:"boolean"}
ncov2019 = False #@param {type:"boolean"}
keyword_dict = {"coronavirus": coronavirus, "virus": virus, "covid": covid, "ncov19": ncov19, "ncov2019": ncov2019}

Let's generate tweets from january $1^{st}$ to may $15^{th}$

In [55]:
#@title Enter range of dates to Hydrate { run: "auto" }
start_date = '2020-03-01' #@param {type:"date"}
end_date = '2020-05-01' #@param {type:"date"}


import datetime as dt
files = []
covid_loc = "COVID19_Tweets_Dataset"
# Looks at each volder
for folder in os.listdir(covid_loc):
    foldername = os.fsdecode(folder)
    # The folder name is a keyword. We continue for keywords selected above
    if keyword_dict.get(foldername.split()[0].lower()) == True:
        folderpath = os.path.join(covid_loc, foldername)
        # Each file is of the format [keyword]_yyyy_mm_dd.txt
        for file in os.listdir(folderpath):
            filename = os.fsdecode(file)
            date = filename[filename.index("_")+1:filename.index(".")]

            # If the date is within the required range, it is added to the
            # list of files to read.
            if (dt.datetime.strptime(start_date, "%Y-%m-%d").date() 
            <= dt.datetime.strptime(date, '%Y_%m_%d').date()
             <= dt.datetime.strptime(end_date, "%Y-%m-%d").date()):
                files.append(os.path.join(folderpath, filename))
# The final list is read, and each of the individual IDs is stored in a collective
# set of IDs. Duplicates are removed.
ids = []
for filename in files:
    with open(filename) as f:
        # The files are of the format: [id1,id2,id3,...,idn]
        # Remove the brackets and split on commas
        ids1 = []
        for i in f.readline().strip('][').replace(" ", "").split(","):
            ids1.append(i)
        ids.append(random.choices(ids1,k=nb_tweets_max))   
# Number of tweets read.
print(round((len(ids)/1000), 3), "thousand unique tweets.")

0.114 thousand unique tweets.


In [56]:
#@title Enter ID output file {run: "auto"}
final_tweet_ids_filename = "final_ids.txt" #@param {type: "string"}
# The set of IDs is stored in this file.
with open(final_tweet_ids_filename, "w+") as f:
    for id in ids:
        f.write('%s\n' % id)

In [57]:
#@title Set up Directory { run: "auto"}
final_tweet_ids_filename = "final_ids.txt" #@param {type: "string"}
output_filename = "output.csv" #@param {type: "string"}

Due to the high number of tweets per day (1.5 million), they can not all be hydrated. Let randomly pick up 2000 tweets per day.

$\textbf{DO NOT WORK}$

In [61]:
import jsonlines, json
# Stores hydrated tweets here as jsonl objects
# Contains one json object per line
output_json_filename = output_filename[:output_filename.index(".")] + ".txt"
ids = []

with open(final_tweet_ids_filename, "r") as ids_file:
    ids = ids_file.read().split()
    hydrated_tweets = []
    ids_to_hydrate = set(ids)

# Looks at the output file for already hydrated tweets
if os.path.isfile(output_json_filename):
    with jsonlines.open(output_json_filename, "r") as reader:
        for i in reader.iter(type=dict, skip_invalid=True):
            # These tweets have already been hydrated. So remove them from ids_to_hydrate
            hydrated_tweets.append(i)
            ids_to_hydrate.remove(i["id_str"])
print("Total IDs: " + str(len(ids)) + ", IDs to hydrate: " + str(len(ids_to_hydrate)))
print("Hydrated: " + str(len(hydrated_tweets)))

count = len(hydrated_tweets)
start_index = count # The index from where tweets haven't been saved to the output_json_file
# Stores hydrated tweets to output_json_file every num_save iterations.
num_save  = 1000


# Now, use twarc and start hydrating

for tweet in t.hydrate(ids_to_hydrate):
    hydrated_tweets.append(tweet)
    count += 1
    # If num_save iterations have passed,
    if (count % num_save) == 0:
        # Open the output file
        # NOTE: Even if the code stops during IO, only tweets from the current iteration are lost.
        # Older tweets are preserved as the file is written in append mode.
        with jsonlines.open(output_json_filename, "a") as writer:
            print("Started IO")
            # Now write the tweets from start_index. The other tweets don't have to be written
            # as they were already written in a previous iteration or run.
            for hydrated_tweet in hydrated_tweets[start_index:]:
                writer.write(hydrated_tweet)
            print("Finished IO")
        print("Saved " + str(count) + " hydrated tweets.")
        # Now, since everything has been written. Reset start_index
        start_index = count


# There might be tweets unwritten in the last iteration if the count is not a multiple of num_tweets.
# In that case, just write out the remainder of tweets.
if count != start_index:
    print("Here with start_index", start_index)
    with jsonlines.open(output_json_filename, "a") as writer:
        for hydrated_tweet in hydrated_tweets[start_index:]:
           writer.write(hydrated_tweet)   

KeyError: '1236443419335176192'

In [62]:
# Convert jsonl to csv
import csv, jsonlines
output_json_filename = output_filename[:output_filename.index(".")] + ".txt"
# These are the column name that are selected to be stored in the csv
keyset = ["created_at", "id", "id_str", "full_text", "source", "truncated", "in_reply_to_status_id",
          "in_reply_to_status_id_str", "in_reply_to_user_id", "in_reply_to_user_id_str", 
          "in_reply_to_screen_name", "user", "coordinates", "place", "quoted_status_id",
          "quoted_status_id_str", "is_quote_status", "quoted_status", "retweeted_status", 
          "quote_count", "reply_count", "retweet_count", "favorite_count", "entities", 
          "extended_entities", "favorited", "retweeted", "possibly_sensitive", "filter_level", 
          "lang", "matching_rules", "current_user_retweet", "scopes", "withheld_copyright", 
          "withheld_in_countries", "withheld_scope", "geo", "contributors", "display_text_range",
          "quoted_status_permalink"]
hydrated_tweets = []
# Reads the current tweets
with jsonlines.open(output_json_filename, "r") as reader:
    for i in reader.iter(type=dict, skip_invalid=True):
        hydrated_tweets.append(i)
# Writes them out
with  open(output_filename, "w+") as output_file:
    d = csv.DictWriter(output_file, keyset)
    d.writeheader()
    d.writerows(hydrated_tweets)

## 14-GeoCOVID

A NLP Geolocalized dataset provided by : https://crisisnlp.qcri.org/covid19


In [1]:
import json
import pandas as pd

data = pd.read_json("/home/urendil/Documents/01-ENSTA/Cours/PRE/COVID/GeoCOVID/geo_2020-02-01.json",encoding='unicode_escape')
data.head()

ValueError: Trailing data