<a href="https://colab.research.google.com/github/yehyifan/Text_Classification_with_NLP_and_PoS_Tagging/blob/main/Text_Classification_with_NLP_and_PoS_Tagging_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Classification with NLP & PoS Tagging  
This notebook builds a natural language processing pipeline that converts English sentences into musical "emoji symphonies" by mapping each word to an emoji based on its part of speech. It uses **NLTK** for tokenization, stopword detection, and PoS tagging, and **WordNet** for lexical exploration. The emoji encoding is a creative way to visualize syntactic patterns, making sentence structure and linguistic roles easier to see.


## Module Import and Data Loading

In [None]:
import numpy as np
import string
import pandas as pd
import re
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from collections import defaultdict, Counter
from ast import literal_eval
import pickle
# NLTK setup. The download log printed by this cell may differ between Colab and a
# local Jupyter installation (e.g. "already up-to-date" vs. a fresh download).
import nltk
# Fetch the NLTK data packages this notebook relies on: the stopword list, WordNet,
# and the tokenizer / POS-tagger resources.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import wordnet as wn
from nltk.tag import pos_tag
# English stopwords as a set; get_rhythm tests lower-cased tokens against it.
stops = set(stopwords.words('english'))
from tqdm import tqdm
from urllib import request
# Google Drive download links, paired by position with the local file names below.
module_url = ["https://drive.google.com/uc?export=view&id=179zVybyc3cCCS5IG9F-STx2Z8pzrGQZ6",
              "https://drive.google.com/uc?export=view&id=1MH56eN-QXyqny35VzbddI7MxA-oGO0IT"]
name = ['big.txt', 'sorted_symphs.pkl']
for url, filename in zip(module_url, name):
    # Save the payload byte-for-byte. Decoding it and re-writing in text mode would
    # re-encode every byte >= 0x80 with the platform default encoding, which corrupts
    # the binary pickle and garbles non-ASCII characters when big.txt is later read
    # back as ISO-8859-1.
    with request.urlopen(url) as f, open(filename, 'wb') as outf:
        outf.write(f.read())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [None]:
def get_rhythm(line):
    """
    Turn each sentence found in a line of text into an emoji "rhythm".

    Every token contributes exactly one emoji. Stopwords (matched
    case-insensitively) always become 🎵; otherwise the first letter of the
    POS tag decides: N -> 🎸, V -> 🎹, J -> 🎷, anything else -> 🎤.

    Args:
        line: A string of text; it may hold zero, one or several sentences.

    Returns:
        A list with one tuple per sentence, in order of appearance:
        - the sentence text as produced by the sentence tokenizer
        - the list of (token, tag) pairs for that sentence
        - the emoji string, one emoji per token
    """
    # Keyed by the tag's first character; tags with any other initial use 🎤.
    instrument_by_initial = {'N': '🎸', 'V': '🎹', 'J': '🎷'}

    rhythm = []
    for sentence in sent_tokenize(line):
        tagged = pos_tag(word_tokenize(sentence))
        # The stopword test takes priority over the tag-based lookup.
        notes = [
            '🎵' if token.lower() in stops else instrument_by_initial.get(tag[:1], '🎤')
            for token, tag in tagged
        ]
        rhythm.append((sentence, tagged, ''.join(notes)))

    return rhythm

def get_symphonies(corpus_path):
    """
    Group the sentences of a text corpus by their emoji rhythm pattern.

    The corpus is read line by line (ISO-8859-1) and each non-blank line is
    passed to get_rhythm, so a sentence is never assembled across a line break.

    Args:
        corpus_path: Path to the text corpus file.

    Returns:
        List of (emoji_sequence, tagged_sentences) tuples where
        - emoji_sequence is the pattern string
        - tagged_sentences is the list of distinct POS-tagged sentences that
          produce that pattern, in first-seen order
        The list is sorted by group size, largest first; groups of equal size
        keep the order in which their pattern first appeared.
    """
    # Pattern -> distinct tagged sentences, in first-seen order
    symphony_dict = defaultdict(list)
    # Pattern -> hashable copies of those sentences, so the duplicate check is a
    # set lookup instead of a scan over an ever-growing list.
    seen_by_pattern = defaultdict(set)

    with open(corpus_path, 'r', encoding='ISO-8859-1') as file:
        for line in file:
            if line.isspace():
                continue

            for _, pos_tagged_sentence, emoji_sequence in get_rhythm(line):
                # A tuple of (word, tag) tuples compares equal exactly when the
                # corresponding lists do, so this keeps the same sentences.
                sentence_key = tuple(pos_tagged_sentence)
                seen = seen_by_pattern[emoji_sequence]
                if sentence_key not in seen:
                    seen.add(sentence_key)
                    symphony_dict[emoji_sequence].append(pos_tagged_sentence)

    # Sort by the number of sentences each emoji sequence produces (largest first)
    sorted_symphs = sorted(symphony_dict.items(), key=lambda x: len(x[1]), reverse=True)

    return sorted_symphs

# Build the pattern index for the downloaded corpus; this tokenizes and POS-tags
# every non-blank line of big.txt.
sorted_symphs = get_symphonies('big.txt')

In [None]:
# Spot-check a single pattern group: print its emoji pattern, then the words of
# every sentence that maps to it.
a = sorted_symphs[1176]
el, list_of_postagged = a
print(el)
for pos_tagged in list_of_postagged:
    words = (token for token, _tag in pos_tagged)
    print(' '.join(words))

🎤🎸🎹🎵🎤🎤
`` God help me ! ''
`` Nobody wants me ! ''
`` Devil take them ! ''
`` Devil take it ! ''
`` Natasha told me . ''


In [None]:
def find_sentences_with_most_specific_nouns(sorted_symphs):
    """
    For each emoji pattern group, find the sentence holding its most specific noun.

    Specificity of a word is the length of the longest hypernym path over all
    WordNet noun synsets of that word. Only tokens whose POS tag starts with
    'NN' are considered, and tokens without a WordNet noun entry are skipped.
    Within a group the first noun reaching the maximum depth wins (ties do not
    replace the current best). Groups with no usable noun are left out.

    WordNet results are memoised per lower-cased word, including misses, so each
    distinct word is looked up once per call.

    Args:
        sorted_symphs: List of (emoji pattern, list of POS-tagged sentences),
            where a POS-tagged sentence is a sequence of (word, tag) pairs.

    Returns:
        List of dicts sorted by 'path_length' in descending order, each with:
            - 'pattern': the emoji pattern of the group
            - 'pos_sentence': the POS-tagged sentence containing the chosen noun
            - 'noun': the chosen noun, as it appears in the sentence
            - 'path_length': length of the longest hypernym path for that noun
            - 'synsets': names of the WordNet noun synsets of that noun
    """
    results = []
    # lower-cased word -> (depth, synset names), or None when WordNet has no noun entry
    noun_depth_cache = {}

    for emoji_pattern, sentences in tqdm(sorted_symphs, desc="Finding specific nouns", unit="symphony"):
        best_noun = None
        best_sentence = None
        best_depth = 0
        best_synset_names = []

        for sentence in sentences:
            for word, pos in sentence:
                if not pos.startswith('NN'):
                    continue

                noun_key = word.lower()

                if noun_key not in noun_depth_cache:
                    synsets = wn.synsets(word, pos='n') or wn.synsets(noun_key, pos='n')
                    if synsets:
                        depth = max(len(path) for syn in synsets for path in syn.hypernym_paths())
                        noun_depth_cache[noun_key] = (depth, [syn.name() for syn in synsets])
                    else:
                        # Remember the miss too; otherwise every repeat of an
                        # unknown word triggers another WordNet lookup.
                        noun_depth_cache[noun_key] = None

                cached = noun_depth_cache[noun_key]
                if cached is None:
                    continue
                current_depth, synset_names = cached

                # Strict comparison: the earliest noun at the maximum depth is kept.
                if current_depth > best_depth:
                    best_noun = word
                    best_sentence = sentence
                    best_depth = current_depth
                    best_synset_names = synset_names

        if best_noun:
            results.append({
                'pattern': emoji_pattern,
                'pos_sentence': best_sentence,
                'noun': best_noun,
                'path_length': best_depth,
                'synsets': best_synset_names
            })

    # Sort results by descending specificity
    results.sort(key=lambda x: x['path_length'], reverse=True)
    return results

# Example usage
# `sorted_symphs` is normally the result of get_symphonies('big.txt') computed above.
# As a fallback, the commented-out line below reads a pre-computed copy from the
# 'sorted_symphs.pkl' file fetched by the download cell. Only unpickle files you
# trust, since pickle.load can execute arbitrary code.
# sorted_symphs = pickle.load(open('sorted_symphs.pkl', 'rb'))
specific_sentences = find_sentences_with_most_specific_nouns(sorted_symphs)

# Number of pattern groups in which at least one noun was found in WordNet
print(len(specific_sentences))

Finding specific nouns: 100%|██████████| 84789/84789 [00:02<00:00, 29867.22symphony/s]

78419





In [None]:
# Show the first three entries; the list is already ordered by descending path
# length, so these are the deepest nouns found.
print("\nMost specific sentences by pattern:")
for result in specific_sentences[:3]:
    report_lines = (
        f"\nPattern: {result['pattern']}",
        f"POS tagged sentence: {result['pos_sentence']}",
        f"Most specific noun: {result['noun']}",
        f"Path length to root: {result['path_length']}",
        f"Synsets: {result['synsets']}",
    )
    # One field per line, same text as printing each line separately
    print(*report_lines, sep="\n")


Most specific sentences by pattern:

Pattern: 🎸🎵🎸🎤
POS tagged sentence: [('Jersey', 'NNP'), ('into', 'IN'), ('Pennsylvania', 'NNP'), ('.', '.')]
Most specific noun: Jersey
Path length to root: 19
Synsets: ['new_jersey.n.01', 'jersey.n.02', 'jersey.n.03', 'jersey.n.04', 'jersey.n.05']

Pattern: 🎸🎵🎸🎸🎤
POS tagged sentence: [('Paterson', 'NNP'), ('of', 'IN'), ('New', 'NNP'), ('Jersey', 'NNP'), ('flatly', 'RB')]
Most specific noun: Jersey
Path length to root: 19
Synsets: ['new_jersey.n.01', 'jersey.n.02', 'jersey.n.03', 'jersey.n.04', 'jersey.n.05']

Pattern: 🎸🎸🎵
POS tagged sentence: [('New', 'NNP'), ('Jersey', 'NNP'), ('had', 'VBD')]
Most specific noun: Jersey
Path length to root: 19
Synsets: ['new_jersey.n.01', 'jersey.n.02', 'jersey.n.03', 'jersey.n.04', 'jersey.n.05']
