### Load the Dataset

In [1]:
import pandas as pd

# Load the dataset
file_path = 'dataset/data.tsv'
data = pd.read_csv(file_path, sep='\t', header=None, names=['Token', 'POS', 'NER'], skip_blank_lines=False)

# Display the first few rows
print(data.head(20))

                                                Token    POS    NER
0   শনিবার (২৭ আগস্ট) রাতে পটুয়াখালী সদর থানার ভা...    NaN    NaN
1                                              শনিবার    NNP  B-D&T
2                                                 (২৭  PUNCT  B-OTH
3                                              আগস্ট)    NNP  B-D&T
4                                                রাতে    NNC  B-D&T
5                                          পটুয়াখালী    NNP  B-GPE
6                                                 সদর    NNC  I-GPE
7                                               থানার    NNC  I-GPE
8                                          ভারপ্রাপ্ত    ADJ  B-PER
9                                           কর্মকর্তা    NNC  I-PER
10                                              (ওসি)  PUNCT  B-OTH
11                                               মো.    NNP  B-PER
12                                       মনিরুজ্জামান    NNP  I-PER
13                                              

In [2]:
data.shape

(53438, 3)

### Clean and Preprocess the Data

In [3]:
data.isna().sum()

Token    4054
POS      8112
NER      8114
dtype: int64

In [4]:
# Count the number of unique sentences
# A sentence is identified by rows where Token, POS and NER are NaN
blank_rows = data[data['Token'].isna() & data['POS'].isna() & data['NER'].isna()].shape[0]

print(f"Number of sentences in the raw dataset: {blank_rows}")

Number of sentences in the raw dataset: 4054


In [5]:
data_cleaned = data.dropna(how='all')
data_cleaned.reset_index(drop=True, inplace=True)

missing_values = data_cleaned.isna().sum()
missing_values

Token       0
POS      4058
NER      4060
dtype: int64

In [6]:
num_sentences = data_cleaned[data_cleaned['POS'].isna() & data_cleaned['NER'].isna()].shape[0]
num_sentences

4058

In [7]:
data_cleaned.tail()

Unnamed: 0,Token,POS,NER
49379,সফর,NNC,B-OTH
49380,বিনিময়ের,NNC,B-OTH
49381,উপর,PP,B-OTH
49382,গুরুত্বারোপ,NNC,B-OTH
49383,করেন।,VF,B-OTH


In [8]:
# Display rows where 'POS' or 'NER' are NaN
rows_with_missing_tags = data_cleaned[data_cleaned['POS'].isna() | data_cleaned['NER'].isna()]
rows_with_missing_tags.head(5)

Unnamed: 0,Token,POS,NER
0,শনিবার (২৭ আগস্ট) রাতে পটুয়াখালী সদর থানার ভা...,,
17,বায়ুদূষণ ও স্মার্ট ফোন ছেলেমেয়ে উভয়ের প্রজনন ক...,,
29,ছাত্র রাজনীতির বর্তমান অবস্থার শুরু হয়েছিলো ...,,
40,"শাকিল রাজধানীর ৩০০ ফিট, দিয়াবাড়ি ও পূর্বাচল ...",,
57,সম্প্রতি ক্লাবের নবীন ব্যবস্থাপনা প্রশিক্ষণার্...,,


In [9]:
# Fill missing values in 'POS' and 'NER' with 'UNKNOWN'
data_cleaned = data_cleaned.dropna(subset=['POS', 'NER'])

missing_values_filled = data_cleaned.isna().sum()
missing_values_filled

Token    0
POS      0
NER      0
dtype: int64

In [10]:
data_cleaned.head()

Unnamed: 0,Token,POS,NER
1,শনিবার,NNP,B-D&T
2,(২৭,PUNCT,B-OTH
3,আগস্ট),NNP,B-D&T
4,রাতে,NNC,B-D&T
5,পটুয়াখালী,NNP,B-GPE


In [11]:
data_cleaned.shape

(45324, 3)

### More Exploratory Analysis

In [12]:
# Distribution of tokens
token_distribution = data_cleaned['Token'].value_counts()
print("Top 20 Tokens:")
print(token_distribution.head(20))

Top 20 Tokens:
ও           442
বলেন,       321
থেকে        285
এ           268
করে         251
করা         228
এই          215
এবং         174
জন্য        168
তিনি        160
এক          159
একটি        155
হয়েছে।     130
করতে        129
:           126
বাংলাদেশ    110
না          109
হবে।        106
মধ্যে       105
হয়।        103
Name: Token, dtype: int64


In [13]:
# Distribution of POS tags
pos_distribution = data_cleaned['POS'].value_counts()
print("POS Tag Distribution:")
print(pos_distribution)

POS Tag Distribution:
NNC      17803
NNP       7544
ADJ       4581
VF        4417
QF        1971
PP        1866
VNF       1602
ADV       1461
PRO       1329
CONJ       947
PUNCT      859
DET        773
PART        85
OTH         67
INTJ        19
Name: POS, dtype: int64


In [14]:
# Distribution of NER tags
ner_distribution = data_cleaned['NER'].value_counts()
print("NER Tag Distribution:")
print(ner_distribution)

NER Tag Distribution:
B-OTH      30932
B-PER       3565
B-ORG       1575
I-PER       1426
B-NUM       1314
I-ORG       1151
B-GPE        997
B-D&T        996
I-D&T        787
B-EVENT      551
B-LOC        460
B-UNIT       301
I-NUM        277
I-EVENT      253
B-MISC       249
I-LOC        232
B-T&T        106
I-T&T         52
I-GPE         51
I-MISC        37
I-UNIT        12
Name: NER, dtype: int64


In [15]:
# Define a threshold for infrequent tags (e.g., less than a certain count)
threshold = 5

# Identify infrequent POS tags
infrequent_pos_tags = pos_distribution[pos_distribution < threshold].index

# Identify infrequent NER tags
infrequent_ner_tags = ner_distribution[ner_distribution < threshold].index

print("Infrequent POS Tags:")
print(infrequent_pos_tags)

print("Infrequent NER Tags:")
print(infrequent_ner_tags)


Infrequent POS Tags:
Index([], dtype='object')
Infrequent NER Tags:
Index([], dtype='object')


In [16]:
# Remove rows with infrequent POS or NER tags
data_cleaned = data_cleaned[~data_cleaned['POS'].isin(infrequent_pos_tags) & ~data_cleaned['NER'].isin(infrequent_ner_tags)]
data_cleaned.reset_index(drop=True, inplace=True)
data_cleaned.head(5)

Unnamed: 0,Token,POS,NER
0,শনিবার,NNP,B-D&T
1,(২৭,PUNCT,B-OTH
2,আগস্ট),NNP,B-D&T
3,রাতে,NNC,B-D&T
4,পটুয়াখালী,NNP,B-GPE


In [17]:
data_cleaned.shape

(45324, 3)

In [18]:
# Summary of distributions
print("\nToken Distribution Summary:")
print(token_distribution.describe())

print("\nPOS Tag Distribution Summary:")
print(pos_distribution.describe())

print("\nNER Tag Distribution Summary:")
print(ner_distribution.describe())


Token Distribution Summary:
count    16247.000000
mean         2.789684
std          9.016175
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max        442.000000
Name: Token, dtype: float64

POS Tag Distribution Summary:
count       15.000000
mean      3021.600000
std       4578.532622
min         19.000000
25%        816.000000
50%       1461.000000
75%       3194.000000
max      17803.000000
Name: POS, dtype: float64

NER Tag Distribution Summary:
count       21.000000
mean      2158.285714
std       6643.519467
min         12.000000
25%        232.000000
50%        460.000000
75%       1151.000000
max      30932.000000
Name: NER, dtype: float64


### Tokenize and Pad Sequences

In [19]:
from bnlp import NLTKTokenizer

bnltk = NLTKTokenizer()

text = "আমি ভাত খাই। সে বাজারে যায়। তিনি কি সত্যিই ভালো মানুষ?"
word_tokens = bnltk.word_tokenize(text)
sentence_tokens = bnltk.sentence_tokenize(text)
print(word_tokens)
print(sentence_tokens)

Vocabulary size: 15338


In [20]:
tokenizer_token

<keras.preprocessing.text.Tokenizer at 0x248ed9dc9c8>

In [21]:
# Initialize Tokenizer for POS tags
tokenizer_pos = tf.keras.preprocessing.text.Tokenizer(lower=False, oov_token='<OOV>')
tokenizer_pos.fit_on_texts(data_cleaned['POS'])

# Convert POS tags to sequences
pos_sequences = tokenizer_pos.texts_to_sequences(data_cleaned['POS'])
pos_sequences = tf.keras.preprocessing.sequence.pad_sequences(pos_sequences, padding='post')

# Initialize Tokenizer for NER tags
tokenizer_ner = tf.keras.preprocessing.text.Tokenizer(lower=False, oov_token='<OOV>')
tokenizer_ner.fit_on_texts(data_cleaned['NER'])

# Convert NER tags to sequences
ner_sequences = tokenizer_ner.texts_to_sequences(data_cleaned['NER'])
ner_sequences = tf.keras.preprocessing.sequence.pad_sequences(ner_sequences, padding='post')

# Print the size of the tokenizers' vocabularies
print(f"POS Vocabulary size: {len(tokenizer_pos.word_index)}")
print(f"NER Vocabulary size: {len(tokenizer_ner.word_index)}")


POS Vocabulary size: 16
NER Vocabulary size: 14


In [22]:
# Check lengths of sequences
print(f"Token sequence length: {sequences.shape[1]}")
print(f"POS sequence length: {pos_sequences.shape[1]}")
print(f"NER sequence length: {ner_sequences.shape[1]}")

# Verify that all sequences have the same length
assert sequences.shape[1] == pos_sequences.shape[1] == ner_sequences.shape[1], "Sequences have different lengths!"

Token sequence length: 41
POS sequence length: 1
NER sequence length: 3


AssertionError: Sequences have different lengths!

In [24]:
import pandas as pd
import tensorflow as tf
import numpy as np

# Reset index to ensure proper grouping
data_cleaned.reset_index(drop=True, inplace=True)

# Add a unique sentence ID for grouping
data_cleaned['sentence_id'] = data_cleaned.index.to_series().diff().ne(1).cumsum()

# Group data by sentence_id
grouped_data = data_cleaned.groupby('sentence_id')

# Initialize lists for processed data
token_sequences = []
pos_sequences = []
ner_sequences = []

# Process each group (i.e., sentence) individually
for _, group in grouped_data:
    tokens = group['Token'].tolist()
    pos_tags = group['POS'].tolist()
    ner_tags = group['NER'].tolist()
    
    # Tokenize each list
    token_seq = tokenizer_token.texts_to_sequences([tokens])[0]
    pos_seq = tokenizer_pos.texts_to_sequences([pos_tags])[0]
    ner_seq = tokenizer_ner.texts_to_sequences([ner_tags])[0]
    
    # Determine max length for padding
    max_len = max(len(token_seq), len(pos_seq), len(ner_seq))
    
    # Pad sequences to ensure consistent length
    token_seq = tf.keras.preprocessing.sequence.pad_sequences([token_seq], maxlen=max_len, padding='post')[0]
    pos_seq = tf.keras.preprocessing.sequence.pad_sequences([pos_seq], maxlen=max_len, padding='post')[0]
    ner_seq = tf.keras.preprocessing.sequence.pad_sequences([ner_seq], maxlen=max_len, padding='post')[0]
    
    # Append to lists
    token_sequences.append(token_seq)
    pos_sequences.append(pos_seq)
    ner_sequences.append(ner_seq)

# Convert lists to numpy arrays
token_sequences = np.array(token_sequences)
pos_sequences = np.array(pos_sequences)
ner_sequences = np.array(ner_sequences)

# Print the shapes of the arrays to verify
print(f"Token sequences shape: {token_sequences.shape}")
print(f"POS sequences shape: {pos_sequences.shape}")
print(f"NER sequences shape: {ner_sequences.shape}")

Token sequences shape: (1, 45324)
POS sequences shape: (1, 45324)
NER sequences shape: (1, 45324)
