# BERT and topic modelling

In [27]:
import pandas as pd

import torch
from torch.nn.functional import softmax

from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split

import random



In [2]:
input_file_path = '/Users/thebekhruz/Desktop/100Days-Of-Code/100-Days-of-NLP-Odyssey/data/intermediate/preprocessing/processed_data_descr.csv'
df = pd.read_csv(input_file_path, delimiter='\t', header=None, names=['doc_id', 'type', 'value'])
df.head()

Unnamed: 0,doc_id,type,value
0,a7bb9917-95ff-3f55-a640-4c5afcec25f2,title,View towards SE of junction of Queen Victoria ...
1,a7bb9917-95ff-3f55-a640-4c5afcec25f2,description,E corner of Queen Victoria Rd at junction with...
2,a7bb9917-95ff-3f55-a640-4c5afcec25f2,mentions,https://www.wikidata.org/wiki/Q64116
3,a7bb9917-95ff-3f55-a640-4c5afcec25f2,mentions,https://www.wikidata.org/wiki/Q64116
4,c29a7b77-7c46-3b85-88fe-05c8f4b2e384,title,"Front page of Bucks Free Press, Time capsule f..."


In [3]:
# df.rename(columns = {'doc_id':'doc_ID', 'type':'text_input'}, inplace = True) 
# df.head()

In [4]:
# Filter to only include titles
title_df = df[df['type'] == 'title'].set_index('doc_id')['value'].to_frame(name='title')
title_df.head()


Unnamed: 0_level_0,title
doc_id,Unnamed: 1_level_1
a7bb9917-95ff-3f55-a640-4c5afcec25f2,View towards SE of junction of Queen Victoria ...
c29a7b77-7c46-3b85-88fe-05c8f4b2e384,"Front page of Bucks Free Press, Time capsule f..."
196c11e6-f7b6-392f-ae41-28653345087c,"High Wycombe Police Station, in Queen Victoria..."
7a5aace6-2398-3dcf-8843-37ff6ccea875,"Reference Library door, Queen Victoria Rd, Hig..."
c66c4715-c03a-3aab-964b-e733f3ff1cf4,"Terrace of brick and flint cottages, Beech Rd,..."


In [5]:
# Filter to only include descriptions
descr_df = df[df['type'] == 'description'].set_index('doc_id')['value'].to_frame(name='description')
title_df.head()

Unnamed: 0_level_0,title
doc_id,Unnamed: 1_level_1
a7bb9917-95ff-3f55-a640-4c5afcec25f2,View towards SE of junction of Queen Victoria ...
c29a7b77-7c46-3b85-88fe-05c8f4b2e384,"Front page of Bucks Free Press, Time capsule f..."
196c11e6-f7b6-392f-ae41-28653345087c,"High Wycombe Police Station, in Queen Victoria..."
7a5aace6-2398-3dcf-8843-37ff6ccea875,"Reference Library door, Queen Victoria Rd, Hig..."
c66c4715-c03a-3aab-964b-e733f3ff1cf4,"Terrace of brick and flint cottages, Beech Rd,..."


In [10]:
merged_df = title_df.merge(descr_df, left_index=True, right_index=True, how='inner')
merged_df.rename(columns = {'doc_id':'DOC_ID', 'title':'TITLE', 'description':'DESCRIPTION'}, inplace = True) 

merged_df['MERGED_DATA'] = merged_df['TITLE'] + '. ' + merged_df['DESCRIPTION']
merged_df.head()



Unnamed: 0_level_0,TITLE,DESCRIPTION,MERGED_DATA
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a7bb9917-95ff-3f55-a640-4c5afcec25f2,View towards SE of junction of Queen Victoria ...,E corner of Queen Victoria Rd at junction with...,View towards SE of junction of Queen Victoria ...
c29a7b77-7c46-3b85-88fe-05c8f4b2e384,"Front page of Bucks Free Press, Time capsule f...",Front page Bucks Free Press The Time Capsule T...,"Front page of Bucks Free Press, Time capsule f..."
196c11e6-f7b6-392f-ae41-28653345087c,"High Wycombe Police Station, in Queen Victoria...",New Police Station High Wycombe viewed from op...,"High Wycombe Police Station, in Queen Victoria..."
7a5aace6-2398-3dcf-8843-37ff6ccea875,"Reference Library door, Queen Victoria Rd, Hig...",Corridor entrance to Reference Library,"Reference Library door, Queen Victoria Rd, Hig..."
c66c4715-c03a-3aab-964b-e733f3ff1cf4,"Terrace of brick and flint cottages, Beech Rd,...",Terrace of brick and flint cottages,"Terrace of brick and flint cottages, Beech Rd,..."


In [15]:
DOCUMENTS = merged_df['MERGED_DATA'].tolist()
print(DOCUMENTS[:5], sep='\n')

['View towards SE of junction of Queen Victoria Road with High St and Easton St, High Wycombe, October 1936. E corner of Queen Victoria Rd at junction with Easton St and High St, High Wycombe', 'Front page of Bucks Free Press, Time capsule for Clock House (formerly Arts School) Frogmoor, High Wycombe. March 1985. Front page Bucks Free Press The Time Capsule The 7th July 1893 Other papers to be included in Clock House Time Capsule', 'High Wycombe Police Station, in Queen Victoria Road, High Wycombe. Oct 1935. New Police Station High Wycombe viewed from opposite side of the road', 'Reference Library door, Queen Victoria Rd, High Wycombe. about 1992. Corridor entrance to Reference Library', 'Terrace of brick and flint cottages, Beech Rd, Wycombe Marsh. about 1935. Terrace of brick and flint cottages']


<!-- ## Splitting the dataset into training and testing sets
 -->


In [6]:
# documents_train, documents_test = train_test_split(DOCUMENTS, test_size=0.1, random_state=32)


## Load pre-trained model and tokenizer

> bert-base-uncased. Learn more at [link](https://huggingface.co/google-bert/bert-base-uncased)


In [18]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(CATEGORIES))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Example.

In [35]:
# Example of use
CATEGORIES = {
    'community': 'Community Events and Social Gatherings - Includes photos of community events, social gatherings, parades, and celebrations.',
    'architecture': 'Architectural and Urban Landscapes - Focuses on buildings, streets, urban landscapes, and architectural details.',
    'historical': 'Historical and Cultural Moments - Captures significant historical and cultural moments, including ceremonies and commemorations.',
    'natural': 'Natural Landscapes and Scenic Views - Showcases natural landscapes, parks, rivers, and scenic views.',
    'personal': 'Personal and Family Life - Centers on photos of individuals, families, and daily life, reflecting personal stories and moments.',
    'education': 'Educational and Institutional - Relates to education, institutions, and formal gatherings.',
    'industrial': 'Industrial and Technological Progress - Documents industrial scenes, technological advancements, and construction.'
}


# Tokenize the document
for i in range(10):
    random_index = random.randrange(20000)
    doc_eg = DOCUMENTS[random_index]

    inputs = tokenizer(doc_eg, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Predict category with BERT
    with torch.no_grad():
        logits = model(**inputs).logits
        probabilities = softmax(logits, dim=1)
        predicted_index = probabilities.argmax()
        predicted_category_key = list(CATEGORIES.keys())[predicted_index]
        predicted_category_description = CATEGORIES[predicted_category_key]

    print(f"Document {random_index}:\n{doc_eg}\n")
    print(f"Document is classified as:--- {predicted_category_key} --- {predicted_category_description} ---\n")



Document 6232:
Looking NE, a view of the premises of J G Peace (Town House), and The Chantry and other shops, Castle Street, High Wycombe. c1970. Castle Street, north side, J G Peace in the Town House, also The Chantry and row of shops

Document is classified as:--- historical --- Historical and Cultural Moments - Captures significant historical and cultural moments, including ceremonies and commemorations. ---

Document 14037:
Looking South, a view of the King George V Public House in London Road, Wycombe Marsh. c1999. London Rd at Wycombe Marsh, a view of the King George V PH

Document is classified as:--- historical --- Historical and Cultural Moments - Captures significant historical and cultural moments, including ceremonies and commemorations. ---

Document 4804:
Looking W, a view of the northern side of the street, including the Post Office and the Church Loft, High St, West Wycombe. c 1952. View of mainly the northern side of the street, including the Post Office and the Church

In [36]:
batch_size = 5
for i in range(0, len(DOCUMENTS), batch_size):
    batch_docs = DOCUMENTS[i:i+batch_size]
    inputs = tokenizer(batch_docs, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Move inputs to the same device as the model
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Predict category with BERT
    with torch.no_grad():
        logits = model(**inputs).logits
        probabilities = torch.softmax(logits, dim=1)
        predicted_indices = probabilities.argmax(dim=1).cpu().numpy()  # Ensure indices are on CPU for numpy compatibility

    for j, idx in enumerate(predicted_indices):
        predicted_category_key = list(CATEGORIES.keys())[idx]
        predicted_category_description = CATEGORIES[predicted_category_key]
        print(f"Document {i+j+1}:\n{batch_docs[j]}\n")
        print(f"Document is classified as:--- {predicted_category_key} --- {predicted_category_description} ---\n")

Document 1:
View towards SE of junction of Queen Victoria Road with High St and Easton St, High Wycombe, October 1936. E corner of Queen Victoria Rd at junction with Easton St and High St, High Wycombe

Document is classified as:--- historical --- Historical and Cultural Moments - Captures significant historical and cultural moments, including ceremonies and commemorations. ---

Document 2:
Front page of Bucks Free Press, Time capsule for Clock House (formerly Arts School) Frogmoor, High Wycombe. March 1985. Front page Bucks Free Press The Time Capsule The 7th July 1893 Other papers to be included in Clock House Time Capsule

Document is classified as:--- historical --- Historical and Cultural Moments - Captures significant historical and cultural moments, including ceremonies and commemorations. ---

Document 3:
High Wycombe Police Station, in Queen Victoria Road, High Wycombe. Oct 1935. New Police Station High Wycombe viewed from opposite side of the road

Document is classified as:-

ValueError: Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.

In [None]:
print(doc_eg)

View towards SE of junction of Queen Victoria Road with High St and Easton St, High Wycombe, October 1936
