# Setup

In [None]:
# Mount Google Drive at /content/drive; the CSV sources and the pickled model
# used further down are read from / written to paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




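To keep the Colab session connected, paste the following snippet into the browser's developer-tools console (right-click → Inspect → Console):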

```
// Clicks Colab's "connect" button, if it is currently present on the page.
function ClickConnect(){
  console.log("Working");
  // The toolbar element, its shadow root or the button itself may be missing
  // (e.g. while the page is still loading). Optional chaining yields undefined
  // in that case instead of throwing a TypeError on every tick.
  const connectButton = document
    .querySelector('#top-toolbar > colab-connect-button')
    ?.shadowRoot?.querySelector('#connect');
  if (connectButton) {
    connectButton.click();
  }
}
// Repeat once a minute (60000 ms).
setInterval(ClickConnect,60000)
```



In [None]:
! pip -q install datasets transformers

[K     |████████████████████████████████| 452 kB 8.9 MB/s 
[K     |████████████████████████████████| 5.8 MB 54.2 MB/s 
[K     |████████████████████████████████| 132 kB 77.2 MB/s 
[K     |████████████████████████████████| 182 kB 69.0 MB/s 
[K     |████████████████████████████████| 212 kB 81.5 MB/s 
[K     |████████████████████████████████| 127 kB 75.3 MB/s 
[K     |████████████████████████████████| 7.6 MB 71.6 MB/s 
[?25h

In [None]:
# Turn on Colab's custom widget manager.
# NOTE(review): presumably needed so notebook widgets (e.g. progress bars)
# render inside Colab — confirm it is still required.
from google.colab import output
output.enable_custom_widget_manager()

In [None]:
import transformers
from datasets import DatasetDict, load_dataset, ClassLabel, Dataset
import random
import pandas as pd
from IPython.display import display, HTML
from pprint import pprint
from tqdm import tqdm

import time

print(transformers.__version__)

4.25.1


# Utils

In [None]:

def show_random_elements(dataset, num_examples=10):
    """Display a random sample of distinct rows of `dataset` as an HTML table.

    Columns whose feature type is a `ClassLabel` are shown with their label
    names instead of the integer ids.

    :param dataset: Dataset supporting `len()`, indexing with a list of row
        indices, and a `features` mapping of column name -> feature type.
    :param num_examples: Number of distinct rows to show; must not exceed
        `len(dataset)`.
    :return: The `pandas.DataFrame` that was displayed.
    :raises AssertionError: If `num_examples` is larger than the dataset.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws the distinct indices in a single call, so there is no
    # retry loop that slows down as num_examples approaches len(dataset).
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            # Map integer class ids to their human-readable names.
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))
    return df


# Prepare Dataset

### local csv

In [None]:
my_codemixed_source = r'/content/drive/MyDrive/Homoglyphed code-mixed/Datsets/source/my_code_mixed_11k.csv'
IIITH_source = r'/content/drive/MyDrive/Homoglyphed code-mixed/Datsets/source/IIITH_Codemixed.csv'

# Both CSV files are loaded together; the rows end up in the 'train' split.
datasets = load_dataset("csv", data_files=[my_codemixed_source,IIITH_source])

datasets = datasets.rename_column("content", "text")
datasets = datasets.remove_columns(['Unnamed: 0', 'label'])


def has_more_than_three_words(example):
    """Return True when the row's text is a string of more than three whitespace-separated words."""
    text = example["text"]
    # Rows whose text is missing (not a str) are dropped rather than crashing on .split().
    return isinstance(text, str) and len(text.split()) > 3


datasets = datasets.filter(has_more_than_three_words).shuffle(seed=42)

# NOTE(review): 'test' is the very same data as 'train', so anything measured on
# 'test' is measured on training data — confirm this is intended.
datasets['test'] = datasets['train']



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-0cd5a21bec5308d3/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-0cd5a21bec5308d3/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?ba/s]

### reduce samples for testing locally

In [None]:
# Cap every split at `test_n` rows for quick local experiments.
# Iterates over whichever splits actually exist, so it also works when the
# DatasetDict has no 'valid' split (indexing a missing split raises KeyError).
test_n = 8_000_000
datasets = DatasetDict({
    # select() keeps the first rows without copying the split into Python lists.
    split_name: split.select(range(min(test_n, len(split))))
    for split_name, split in datasets.items()
    })

### Check dataset

In [None]:
datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 13531
    })
    test: Dataset({
        features: ['text'],
        num_rows: 13531
    })
})

In [None]:
random_tokens = show_random_elements(datasets["train"])

Unnamed: 0,text
0,Apka kya khna PM ji
1,Ye shayad sahi waqt hai kehne ka ki - neend nhi aa rhi
2,What s going on yaar ? Jaldi sort out karo face_with_rolling_eyes
3,Zindagi me Bc kisi Ke liye kitna bhi sacrifice karo sabko kam hi pad jata hain gharwale
4,Very shameful news year ki bachhi se rape
5,Bhai acha mazzak banaya aap ne aaj :-W
6,"#Sallu_bhai mere ko prsnly msg kr do ek baar... Bhai #kuchh_khas hai aap ke lye, whatsapp 8083503086"
7,kamaal karte ho Q intzar kra rhe ho
8,face_without_mouth i MISS you pooo All the Best for your final sem final Exam hop project s n codings are completed n don t affred while doing your presentation Trust your self You do BEST i kn tht heart_suit
9,Tu hi re MAKE WAY FOR TZH FIRST LOOK


# n-grams LM

https://www.kaggle.com/code/alvations/n-gram-language-model-with-nltk?kernelSessionId=17673021

https://www.nltk.org/api/nltk.lm.html

In [None]:
import nltk
from nltk.util import pad_sequence
from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import everygrams
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten
from nltk.lm.preprocessing import padded_everygram_pipeline

from nltk import word_tokenize, sent_tokenize

nltk.download('punkt')

### Tokenization

In [None]:
# Splits that go through the n-gram pipeline ('valid' is currently disabled).
dataset_keys = ['train', 'test']

In [None]:
tokenized_dataset = {}

for key in dataset_keys:
  print(f'SPLIT : {key}')
  # One lower-cased token list per sentence; a single text may yield several sentences.
  tokenized_dataset[key] = [
      [token.lower() for token in word_tokenize(sentence)]
      for document in tqdm(datasets[key]['text'])
      for sentence in sent_tokenize(document)
  ]


SPLIT : train


100%|██████████| 13531/13531 [00:02<00:00, 5083.54it/s]


SPLIT : test


100%|██████████| 13531/13531 [00:02<00:00, 5176.22it/s]


#### Padding and special tokens

In [None]:
# Preprocess the tokenized text for 3-grams language modelling
n = 3

# Per split: 'data' holds the per-sentence n-gram stream, 'padded_sents' the
# padded token stream, exactly as returned by padded_everygram_pipeline.
tokenized_dataset_padded = {}

for key in dataset_keys:
  ngram_stream, vocab_stream = padded_everygram_pipeline(n, tokenized_dataset[key])
  tokenized_dataset_padded[key] = {
      'data': ngram_stream,
      'padded_sents': vocab_stream,
  }


In [None]:
# Pull the training split's n-gram stream and padded token stream out of the dict.
train_split = tokenized_dataset_padded['train']
train_data = train_split['data']
padded_sents = train_split['padded_sents']

In [None]:
# Build the per-sentence n-grams afresh instead of reading the generator stored
# in `tokenized_dataset_padded['test']['data']`. That generator can only be
# consumed once, so draining it here would leave it empty for any later cell
# and would make this cell produce an empty list when re-run.
test_dataset = padded_everygram_pipeline(n, tokenized_dataset['test'])[0]

# Flatten to the distinct n-grams seen anywhere in the test split.
test_data = list({ngram for sent_ngrams in test_dataset for ngram in sent_ngrams})

In [None]:
len(test_data)

358883

In [None]:
train_data

<generator object padded_everygram_pipeline.<locals>.<genexpr> at 0x7f45c1c98660>

### Training

In [None]:
from nltk.lm import Laplace

In [None]:
# Untrained order-`n` language model with Laplace smoothing; its vocabulary
# stays empty until `fit` is called.
model = Laplace(n)
len(model.vocab)

0

In [None]:
# `train_data` is a generator (single pass): re-running this cell without
# rebuilding the padded pipeline first feeds the model no new n-grams.
model.fit(train_data, padded_sents)
print(model.vocab)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 27278 items>


In [None]:
print(len(model.vocab))
print(model.vocab.lookup(tokenized_dataset['train'][0]))

27278
('aap', 'ye', 'btaie', 'ki', 'aap', 'lucknow', 'kyu', 'nhi', 'ate', 'hai')


In [None]:
# Looking up a sentence that is not from the training data replaces every
# word missing from the vocabulary with `<UNK>`.
print(model.vocab.lookup('ye kya lah , shuru kar aliens .'.split()))

('ye', 'kya', 'lah', ',', 'shuru', 'kar', '<UNK>', '.')


In [None]:
# with Laplace smoothing at 200k real : PP = 199,500.9476269511
# with Laplace smoothing at 1M real : PP = 554,169.1948166972

# NOTE(review): `test_data` is a de-duplicated list of n-grams and the 'test'
# split is the same data as 'train', so this is neither a held-out nor a
# frequency-weighted perplexity — confirm this is the intended measurement.
model.perplexity(test_data)

15000.321546018504

In [None]:
# Per-sentence perplexity over the test split.
# The n-gram generators are rebuilt here because the ones stored in
# `tokenized_dataset_padded['test']` are single-pass and have already been
# consumed by the cell that builds the flattened `test_data` list.
test_sentence_ngrams = padded_everygram_pipeline(n, tokenized_dataset['test'])[0]

for sentence_ngrams in test_sentence_ngrams:
  # Materialise the sentence's n-grams: scoring consumes the generator, so
  # without a list there would be nothing left to print afterwards.
  sentence_ngrams = list(sentence_ngrams)
  try:
    print("Perplexity: ", model.perplexity(sentence_ngrams))
    print(f"for {sentence_ngrams}")
  except Exception as err:
    # Report why a sentence was skipped; unlike a bare `except:`, this still
    # lets KeyboardInterrupt stop the loop.
    print(f'SKIPPED: {err!r}')

### Using the trained model

In [None]:
print(model.counts)

<NgramCounter with 3 ngram orders and 788136 ngrams>


In [None]:
model.counts[['shuru', 'kar']]['diye'] # i.e. Count('diye'|'shuru kar')

0

In [None]:
model.score('diye', 'shuru kar'.split())  # P('diye'|'shuru kar')

3.665957914803138e-05

In [None]:
model.logscore('diye', 'shuru kar'.split())  # log2 of P('diye'|'shuru kar')

-14.735450250603018

### Generation using n-gram

In [None]:
from nltk.tokenize.treebank import TreebankWordDetokenizer

In [None]:
print(model.generate(20, random_seed=7))

['behen', 'argue', 'to', 'the', 'next', 'five', 'years', 'and', 'graduated', 'from', 'pakistan', 'as', 'mbbs', 'dr', 'clear', 'mci', 'exam', 'still', 'waiting', '3:30']


In [None]:
detokenize = TreebankWordDetokenizer().detokenize

def generate_sent(model, num_words, random_seed=42):
    """
    Generate one detokenized sentence from an n-gram language model.

    Sentence-start markers (`<s>`) are dropped and the output is cut at the
    first sentence-end marker (`</s>`), so the result may contain fewer than
    `num_words` words.

    :param model: An ngram language model from `nltk.lm.model`.
    :param num_words: Max no. of words to generate.
    :param random_seed: Seed value for random.
    :return: The generated words joined into one string by `detokenize`.
    """
    tokens = model.generate(num_words, random_seed=random_seed)
    if isinstance(tokens, str):
        # `generate` hands back a bare string instead of a list when asked for
        # a single word; wrap it so the loop walks over words, not characters.
        tokens = [tokens]

    content = []
    for token in tokens:
        if token == '<s>':
            continue
        if token == '</s>':
            break
        content.append(token)
    return detokenize(content)

In [None]:
generate_sent(model, 20, random_seed=7)

'behen argue to the next five years and graduated from pakistan as mbbs dr clear mci exam still waiting 3:30'

### Save the Model

In [None]:
import dill as pickle


dumping the model

In [None]:
# Serialise the trained model to Drive (`pickle` is the dill alias imported above).
model_dump_path = '/content/drive/MyDrive/Homoglyphed code-mixed/n_grams/13k_3_ngram.pkl'
with open(model_dump_path, 'wb') as model_file:
    pickle.dump(model, model_file)

### Loading the model

In [None]:
# Restore the model from Drive. Unpickling can execute code embedded in the
# file, so only load pickles you produced yourself.
model_load_path = '/content/drive/MyDrive/Homoglyphed code-mixed/n_grams/13k_3_ngram.pkl'
with open(model_load_path, 'rb') as model_file:
    model_loaded = pickle.load(model_file)

In [None]:
generate_sent(model_loaded, 30, random_seed=41)

'cornflakes se goldflakes ho gayi kya bachha kya budha aaj desh mein reh kar desh ke pradhan mantri kab'

In [None]:
model_loaded.logscore('diye', 'shuru kar'.split())  # log2 of P('diye'|'shuru kar')

-14.735450250603018