In [14]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from io import StringIO
import os
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import nltk
from nltk import ngrams
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:

# Specify the path to the directory containing the data
data_directory = '../50.021-ai-event-detection/data'

# Construct the absolute path to the JSONL file
jsonl_path = os.path.join(data_directory, 'train.jsonl')

# Read the content of the JSONL file
with open(jsonl_path, 'r', encoding='utf-8') as file:
    json_content = file.read()

# Wrap the JSON content in a StringIO object
json_io = StringIO(json_content)

# Read the JSONL content from StringIO object
df = pd.read_json(json_io, lines=True)

# Display the DataFrame
print(df.head())


                                     title                                id  \
0  2006 Pangandaran earthquake and tsunami  8307a6b61b84d4eea42c1dd5e6e2cdba   
1             Battle of Santa Clara (1927)  387fe1dfe55067eb29e1fd4116d37af3   
2              Siege of Pondicherry (1793)  268c4763208c87ed7ebf55565c274d23   
3                        Battle of Leuthen  c95e68565081126b5c949117e423695a   
4           Glasgow St Enoch rail accident  3bec0b60c0940c5e46ee2cfc9504df92   

                                             content  \
0  [{'sentence': 'The 2006 Pangandaran earthquake...   
1  [{'sentence': 'The Battle of Santa Clara took ...   
2  [{'sentence': '"For other sieges with this nam...   
3  [{'sentence': 'The Battle of Leuthen was fough...   
4  [{'sentence': 'The Glasgow St Enoch rail accid...   

                                              events  \
0  [{'id': '40b3b20bc2eeb6b163538b82c1379ead', 't...   
1  [{'id': '966f55ccc3fc199e066929414c392266', 't...   
2  [{'id': 'f2

In [17]:
# shape of the dataset
print(df.shape)
# total number of unique categories
print("Unique categories:",df['title'].nunique())
print("-------------------------------------------------")
# information about metadata
df.info()

(2913, 5)
Unique categories: 2913
-------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2913 entries, 0 to 2912
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   title              2913 non-null   object
 1   id                 2913 non-null   object
 2   content            2913 non-null   object
 3   events             2913 non-null   object
 4   negative_triggers  2913 non-null   object
dtypes: object(5)
memory usage: 113.9+ KB


In [18]:
# drop irrelevant columns
df['type'] = df['events'].apply(lambda x: x[0]['type'] if x else None)
new_df = df[['title']]
new_df.head()

Unnamed: 0,title
0,2006 Pangandaran earthquake and tsunami
1,Battle of Santa Clara (1927)
2,Siege of Pondicherry (1793)
3,Battle of Leuthen
4,Glasgow St Enoch rail accident


In [19]:
print(new_df)

                                        title
0     2006 Pangandaran earthquake and tsunami
1                Battle of Santa Clara (1927)
2                 Siege of Pondicherry (1793)
3                           Battle of Leuthen
4              Glasgow St Enoch rail accident
...                                       ...
2908           1979 Football League Cup Final
2909                   K-1 Premium Dynamite!!
2910                       2002 Hebron ambush
2911       Minneapolis general strike of 1934
2912                      Spanair Flight 5022

[2913 rows x 1 columns]


In [20]:
# Split the string into words using split() and then get the length of resulting list of title
new_df['word_count'] = new_df['title'].str.split().str.len()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['word_count'] = new_df['title'].str.split().str.len()


In [21]:
print(new_df)

                                        title  word_count
0     2006 Pangandaran earthquake and tsunami           5
1                Battle of Santa Clara (1927)           5
2                 Siege of Pondicherry (1793)           4
3                           Battle of Leuthen           3
4              Glasgow St Enoch rail accident           5
...                                       ...         ...
2908           1979 Football League Cup Final           5
2909                   K-1 Premium Dynamite!!           3
2910                       2002 Hebron ambush           3
2911       Minneapolis general strike of 1934           5
2912                      Spanair Flight 5022           3

[2913 rows x 2 columns]


In [22]:
# Get character length of the title
new_df['character_count'] = new_df['title'].str.len()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['character_count'] = new_df['title'].str.len()


In [23]:
print(new_df)

                                        title  word_count  character_count
0     2006 Pangandaran earthquake and tsunami           5               39
1                Battle of Santa Clara (1927)           5               28
2                 Siege of Pondicherry (1793)           4               27
3                           Battle of Leuthen           3               17
4              Glasgow St Enoch rail accident           5               30
...                                       ...         ...              ...
2908           1979 Football League Cup Final           5               30
2909                   K-1 Premium Dynamite!!           3               22
2910                       2002 Hebron ambush           3               18
2911       Minneapolis general strike of 1934           5               34
2912                      Spanair Flight 5022           3               19

[2913 rows x 3 columns]


In [24]:
# Function to generate n-grams for the title
def generate_ngrams(text, n):
    # Tokenize the text into words
    tokens = word_tokenize(text)
    # Generate n-grams
    return list(ngrams(tokens, n))

In [25]:
n = 2
new_df['bigrams'] = new_df['title'].apply(lambda title: generate_ngrams(title, n))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['bigrams'] = new_df['title'].apply(lambda title: generate_ngrams(title, n))


In [26]:
print(new_df)

                                        title  word_count  character_count  \
0     2006 Pangandaran earthquake and tsunami           5               39   
1                Battle of Santa Clara (1927)           5               28   
2                 Siege of Pondicherry (1793)           4               27   
3                           Battle of Leuthen           3               17   
4              Glasgow St Enoch rail accident           5               30   
...                                       ...         ...              ...   
2908           1979 Football League Cup Final           5               30   
2909                   K-1 Premium Dynamite!!           3               22   
2910                       2002 Hebron ambush           3               18   
2911       Minneapolis general strike of 1934           5               34   
2912                      Spanair Flight 5022           3               19   

                                                bigrams  
0    

In [28]:
# Use CountVectorizer to convert a collection of text documents to a matrix of token counts
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(new_df['title'])

# Set the number of topics
n_topics = 5

# Initialize LDA
lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
lda.fit(X)

# Get the words that are most common in each topic
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    print(f"Topic #{topic_idx}:")
    print(" ".join([feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]))

Topic #0:
war flight hurricane battle tour air airlines bombing second siege
Topic #1:
hurricane massacre festival 2011 2005 summer olympics tour 2014 2000
Topic #2:
hurricane action siege crash revolution 2010 2013 disaster rail massacre
Topic #3:
operation festival war invasion hurricane civil indian games north cyclone
Topic #4:
battle world cup storm tour tropical final league championship 2009
