<a href="https://colab.research.google.com/github/yohan2001colombo/NLP/blob/main/nlp_day_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyphen


Collecting pyphen
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Downloading pyphen-0.17.2-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen
Successfully installed pyphen-0.17.2


In [None]:
import nltk
import epitran
import panphon
import matplotlib
import pyphen




## 1.Phonetic Transcription Using Pronouncing

In [None]:
import pronouncing

words = ["phonetics","phonology","morphology","analysis","transcription"]
transcriptions = {}

for word in words:
    transcription = pronouncing.phones_for_word(word)
    transcriptions[word] = transcription[0] if transcription else "No transcrption found"
print(transcriptions)



{'phonetics': 'F AH0 N EH1 T IH0 K S', 'phonology': 'F AH0 N AA1 L AH0 JH IY2', 'morphology': 'M AO0 R F AA1 L AH0 JH IY0', 'analysis': 'AH0 N AE1 L AH0 S AH0 S', 'transcription': 'T R AE2 N S K R IH1 P SH AH0 N'}


## 2. Phonological Feature Extraction with Panphon

In [None]:
import panphon

# define ARPAbet to IPA mapping (extend as needed)
arpabet_to_ipa = arpabet_to_ipa = {
    'P': 'p', 'B': 'b', 'K': 'k', 'D': 'd', 'G': 'g',
    'T': 't', 'DH': 'ð', 'N': 'n', 'M': 'm', 'NG': 'ŋ',
    'S': 's', 'Z': 'z', 'SH': 'ʃ', 'ZH': 'ʒ', 'CH': 'ʧ',
    'JH': 'ʤ', 'Y': 'j', 'W': 'w', 'R': 'ɹ', 'L': 'l',
    'Y': 'j', 'IY': 'iː', 'IH': 'ɪ', 'EH': 'ɛ', 'AE': 'æ',
    'AA': 'ɑː', 'AO': 'ɔː', 'UH': 'ʊ', 'UW': 'uː', 'ER': 'ɜː',
    'AH': 'ə', 'AW': 'aʊ', 'OY': 'ɔɪ', 'OW': 'oʊ', 'OY': 'ɔɪ',
    'AY': 'aɪ', 'EH0': 'ə', 'IH0': 'ɪ', 'AH0': 'ə', 'UH0': 'ʊ',
    'AW0': 'aʊ', 'OY0': 'ɔɪ', 'OW0': 'oʊ', 'AY0': 'aɪ'
}
# This expanded dictionary includes a broader range of ARPAbet symbols and their IPA equivalents. You can further extend this mapping based on your specific requirements.

#Functional to convert ARPAbet to IPA
def arpabet_to_ipa_converter(arpabet):
    return ''.join([arpabet_to_ipa.get(ph.strip('0123456789'),'') for ph in arpabet.split()]) # The function strips these markers, potentially losing stress information.

# Example transcription dictionary
transcriptions = {'phonetics': 'F AH0 N EH1 T IH0 K S', 'phonology': 'F AH0 N AA1 L AH0 JH IY2', 'morphology': 'M AO0 R F AA1 L AH0 JH IY0', 'analysis': 'AH0 N AE1 L AH0 S AH0 S', 'transcription': 'T R AE2 N S K R IH1 P SH AH0 N'}

ft = panphon.FeatureTable()

# Generate phonological features for each word
features = {
    word : ft.word_fts(arpabet_to_ipa_converter(trans)) for word,trans in transcriptions.items()
    if trans != "No transcription found"

}
# After converting ARPAbet transcriptions to IPA, the code uses PanPhon's FeatureTable class to analyze the phonological features of each word. The word_fts method returns a list of Segment objects, each representing a phoneme with its articulatory features

# print the features
print(features)

# Each Segment object contains a list of features such as syl (syllabic), son (sonorant), cont (continuant), and so on, with values indicating their presence (+), absence (-), or neutrality (0).


# These features can be utilized for various linguistic analyses, including phoneme classification, language comparison, and phonological pattern recognition

{'phonetics': [<Segment [+syl, +son, -cons, +cont, -delrel, -lat, -nas, -strid, +voi, -sg, -cg, 0ant, -cor, 0distr, -lab, -hi, -lo, +back, -round, -velaric, -tense, -long, 0hitone, 0hireg]>, <Segment [-syl, +son, +cons, -cont, -delrel, -lat, +nas, -strid, +voi, -sg, -cg, +ant, +cor, -distr, -lab, -hi, -lo, -back, -round, -velaric, 0tense, -long, 0hitone, 0hireg]>, <Segment [+syl, +son, -cons, +cont, -delrel, -lat, -nas, -strid, +voi, -sg, -cg, 0ant, -cor, 0distr, -lab, -hi, -lo, -back, -round, -velaric, -tense, -long, 0hitone, 0hireg]>, <Segment [-syl, -son, +cons, -cont, -delrel, -lat, -nas, -strid, -voi, -sg, -cg, +ant, +cor, -distr, -lab, -hi, -lo, -back, -round, -velaric, 0tense, -long, 0hitone, 0hireg]>, <Segment [+syl, +son, -cons, +cont, -delrel, -lat, -nas, -strid, +voi, -sg, -cg, 0ant, -cor, 0distr, -lab, +hi, -lo, -back, -round, -velaric, -tense, -long, 0hitone, 0hireg]>, <Segment [-syl, -son, +cons, -cont, -delrel, -lat, -nas, -strid, -voi, -sg, -cg, -ant, -cor, 0distr, -lab

## Phonome Frequency Analysis

In [None]:
from collections import Counter

#split all transcriptions into phonemes(by space),then
all_phonemes = "".join(transcriptions.values()).split()
phoneme_counts = Counter(all_phonemes)

print(phoneme_counts)

Counter({'AH0': 7, 'N': 5, 'L': 3, 'R': 3, 'F': 2, 'K': 2, 'AA1': 2, 'JH': 2, 'S': 2, 'EH1': 1, 'T': 1, 'IH0': 1, 'SF': 1, 'IY2M': 1, 'AO0': 1, 'IY0AH0': 1, 'AE1': 1, 'ST': 1, 'AE2': 1, 'IH1': 1, 'P': 1, 'SH': 1})


In [None]:
"".join(transcriptions.values()).split();

## Stress Pattern Analysis with CMU Dictionary

In [None]:
nltk.download('cmudict')

[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Unzipping corpora/cmudict.zip.


True

In [None]:
from nltk.corpus import cmudict

# Initialize the CMU Pronouncing Dictionary
d = cmudict.dict()

# Make sure 'words' is defined
words = ['phonetics','phonology','morphology','analysis','transcription']

stress_patterns = {}

for word in words:
  if word in d:
    trans = d[word][0]
    pattern = ''.join(['1' if '1' in ph else '0' for ph in trans])
    stress_patterns[word] = pattern
  else:
    stress_patterns[word] = "No found"

print(stress_patterns)



{'phonetics': '00010000', 'phonology': '00010000', 'morphology': '000010000', 'analysis': '00100000', 'transcription': '000000010000'}


In [None]:
d["phonetics"][0]

['F', 'AH0', 'N', 'EH1', 'T', 'IH0', 'K', 'S']

## 5. Vowel and Consonants Analysis

In [None]:
from collections import Counter

# Define ARPAbet vowel phonemes
arpabwt_vowels = {"AA", "AE", "AH", "AO", "AW", "AY", "EH", "ER", "EY", "IH","IY", "OW", "OY", "UH", "UW"}

#combine all the transcriptions and split into phonemes
phonemes = " ".join(transcriptions.values()).split()

# Seperate vowels and consonants (strip stress digits)
vowels = [ph for ph in phonemes if ph.strip('0123456789') in arpabwt_vowels]
consonants = [ph for ph in phonemes if ph.strip('0123456789') not in arpabwt_vowels]

# Count
vowel_count = Counter(vowels)
consonant_count = Counter(consonants)

print("Vowels:",vowel_count)
print("Consonants:",consonant_count)




Vowels: Counter({'AH0': 8, 'AA1': 2, 'EH1': 1, 'IH0': 1, 'IY2': 1, 'AO0': 1, 'IY0': 1, 'AE1': 1, 'AE2': 1, 'IH1': 1})
Consonants: Counter({'N': 5, 'S': 4, 'F': 3, 'L': 3, 'R': 3, 'T': 2, 'K': 2, 'JH': 2, 'M': 1, 'P': 1, 'SH': 1})


['AH0',
 'EH1',
 'IH0',
 'AH0',
 'AA1',
 'AH0',
 'IY2',
 'AO0',
 'AA1',
 'AH0',
 'IY0',
 'AH0',
 'AE1',
 'AH0',
 'AH0',
 'AE2',
 'IH1',
 'AH0']

In [None]:
import nltk

# Download the punkt tokenizer data
nltk.download('punkt_tab')

from nltk.tokenize import word_tokenize

# sample text
text = "Hello! This is a quick test"

# Tokenize
tokens = word_tokenize(text)

print(tokens)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


['Hello', '!', 'This', 'is', 'a', 'quick', 'test']


In [None]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download necesary data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# sample text
text = "The striped bats are hanging on their feet for best"

# Tokenize text into words
words = word_tokenize(text)


# Remove Engilsh stopwords
filtered = [w for w in words if w not in stopwords.words('english')]

# Initialie stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Apply stemming and lemmatiation
stemmed = [stemmer.stem(word) for word in words]
lemmatized = [lemmatizer.lemmatize(word) for word in words]

print("filtered words", filtered)
print("Lemmatized:",lemmatized)
print("Stemmed_words:",stemmed)


filtered words ['The', 'striped', 'bats', 'hanging', 'feet', 'best']
Lemmatized: ['The', 'striped', 'bat', 'are', 'hanging', 'on', 'their', 'foot', 'for', 'best']
Stemmed_words: ['the', 'stripe', 'bat', 'are', 'hang', 'on', 'their', 'feet', 'for', 'best']


## 7.3 Morphological Complexity and Affis Analysis

In [None]:
from collections import Counter

lengths = [len(w) for w in filtered]
average_length = sum(lengths)/len(lengths) if lengths else 0
print("Average word length:",average_length)

prefixes = Counter([w[:3] for w in filtered if len(w)<3])


Average word length: 4.833333333333333


## 7.4 Syllable and Compound Word Analysis

In [None]:
import pyphen

dic = pyphen.Pyphen(lang='en')
syllables = [len(dic.inserted(w).split('-')) for w in filtered if w.strip() != '']

avg_syllabes = sum(syllables)/len(syllables) if syllables else 0
print("Avg syllables per word:",avg_syllabes)

Avg syllables per word: 1.0
