In [86]:

!pip install nltk

In [9]:
import re
import numpy as np
import nltk
from nltk.corpus import stopwords

In [10]:
nltk.download("punkt")
nltk.download("stopwords")
nepali_stopwords = set(stopwords.words("nepali"))

[nltk_data] Downloading package punkt to /home/yachana/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/yachana/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
def word_tokenizer(text):
    return text.split(' ')

def remove_special_char(text):
    text = re.sub('[।,?!]', "", text)
    return text

def remove_stopwords(text, stop_words=nepali_stopwords):
    filtered_text = [word for word in word_tokenizer(text) if word not in stop_words]
    return " ".join(filtered_text)

def Devnagari_Extractor(text):
    text = remove_special_char(text)
    text = remove_stopwords(text)
    tokens = word_tokenizer(text)

    devanagari_range = r'[\u0900-\u097F\\]'

    def is_devanagari(token):
        devanagari_chars = [char for char in token if re.match(devanagari_range, char)]
        return len(devanagari_chars) >= len(token) / 2

    return " ".join([token for token in tokens if is_devanagari(token)])

In [12]:
sample_text = "काठमाडौँभित्र 3%%% दिनभर पानी पर्ने र पोखरालगाएतमा बुटवलमा @ पानी नपर्ने Rajan 💪💪 😢 २४ १४११ १२३१२३२१३"

In [13]:
word_tokenizer(sample_text)

['काठमाडौँभित्र',
 '3%%%',
 'दिनभर',
 'पानी',
 'पर्ने',
 'र',
 'पोखरालगाएतमा',
 'बुटवलमा',
 '@',
 'पानी',
 'नपर्ने',
 'Rajan',
 '💪💪',
 '😢',
 '२४',
 '१४११',
 '१२३१२३२१३']

In [14]:
remove_special_char(sample_text)

'काठमाडौँभित्र 3%%% दिनभर पानी पर्ने र पोखरालगाएतमा बुटवलमा @ पानी नपर्ने Rajan 💪💪 😢 २४ १४११ १२३१२३२१३'

In [15]:
remove_stopwords(sample_text)

'काठमाडौँभित्र 3%%% दिनभर पानी पर्ने पोखरालगाएतमा बुटवलमा @ पानी नपर्ने Rajan 💪💪 😢 २४ १४११ १२३१२३२१३'

In [16]:
sample_text = Devnagari_Extractor(sample_text)
sample_text

'काठमाडौँभित्र दिनभर पानी पर्ने पोखरालगाएतमा बुटवलमा पानी नपर्ने २४ १४११ १२३१२३२१३'

In [17]:
suffix_path = "/home/yachana/Downloads/suffix.txt"
def get_suffix():
    # Create a dictionary based on the length of suffix
    with open(suffix_path, 'r') as suff_file:
        suffixes = {}
        for row in suff_file.read().splitlines():
            stem_len = len(list(row))
            if stem_len not in suffixes:
                suffixes[stem_len] = [row]
            else:
                suffixes[stem_len] += ([row])

    return suffixes

In [18]:
suffix = get_suffix()

In [19]:
def remove_suffix(word,suffix):
    for L in 9, 8, 7, 6, 5, 4, 3, 2:
        if len(word) > L + 1:
            for suf in suffix[L]:
                if word.endswith(suf):
                    ans = word[:-L]
                    return ans
    return word  # return the original word if no suffix is found

def process_sentence(sentence, suffix):
    words = sentence.split()
    processed_words = [remove_suffix(word, suffix) for word in words]
    processed_sentence = ' '.join(processed_words)
    return processed_sentence

In [20]:
process_sentence(sample_text,suffix)

'काठमाडौँ दिन पानी पर्ने पोखरा बुटवल पानी नपर्ने २४ १४११ १२३१२३२१३'

Vectorize text

In [21]:
# Building a vocabulary of nepali words
def build_vocab(data):
  vocab = []
  for sentence in data:
    # Cleaning
    cleaned_words = Devnagari_Extractor(sentence)
    # Tokenizing
    word_tokens = word_tokenizer(cleaned_words)
    vocab.extend(word_tokens)
  return list(set(vocab))

In [22]:
data = ["नमस्कार! के छ खबर?","रिता तपाईंको मित्र हो","रामले वात खायो।"]

In [23]:
vocab = build_vocab(data)

In [24]:
vocab

['खायो', 'रामले', 'वात', 'तपाईंको', 'मित्र', 'रिता', 'खबर', 'नमस्कार']

In [25]:
word2idx = {word:i for i,word in enumerate(vocab)}
idx2word = {i:word for word,i in word2idx.items()}

In [26]:
word2idx

{'खायो': 0,
 'रामले': 1,
 'वात': 2,
 'तपाईंको': 3,
 'मित्र': 4,
 'रिता': 5,
 'खबर': 6,
 'नमस्कार': 7}

In [27]:
idx2word

{0: 'खायो',
 1: 'रामले',
 2: 'वात',
 3: 'तपाईंको',
 4: 'मित्र',
 5: 'रिता',
 6: 'खबर',
 7: 'नमस्कार'}

In [28]:
def bag_of_words(sent):
  #Cleaning
  cleaned_words = Devnagari_Extractor(sent)
  #Tokenizing
  processed_texts = word_tokenizer(cleaned_words)
  vector = np.zeros(len(vocab))
  for word in processed_texts:
    vector[word2idx[word]] = 1
  return vector

In [29]:
bag_of_words(data[0])

array([0., 0., 0., 0., 0., 0., 1., 1.])

TF-IDF


In [30]:
# Counting the number of times each word appear in documents
def count_dict(sentences):
    count_dict = {}
    for word in vocab:
        count_dict[word] = 0
    for sent in sentences:
        #Cleaning the text
        cleaned_words = Devnagari_Extractor(sent)
        #Tokenizing the text
        processed_texts = word_tokenizer(cleaned_words)
        for word in processed_texts:
            count_dict[word] += 1
    return count_dict

In [31]:
word_count = count_dict(data)
print(word_count)

{'खायो': 1, 'रामले': 1, 'वात': 1, 'तपाईंको': 1, 'मित्र': 1, 'रिता': 1, 'खबर': 1, 'नमस्कार': 1}


In [32]:
def term_frequency(document, word):
    N = len(document)
    occurance = 0
    for token in document:
        if token == word:
            occurance += 1
    return occurance / N

In [33]:
def inverse_document_frequency(word):
    try:
        word_occurance = word_count[word] + 1
    except:
        word_occurance = 1
    return np.log(len(data) / word_occurance)

In [34]:
def tf_idf(sentence):
    vec = np.zeros((len(vocab),))
    cleaned_words = Devnagari_Extractor(sentence)
    #Tokenizing the text
    processed_texts = word_tokenizer(cleaned_words)
    for word in processed_texts:
        tf = term_frequency(processed_texts, word)
        idf = inverse_document_frequency(word)
        vec[word2idx[word]] = tf * idf
    return vec

In [35]:
data

['नमस्कार! के छ खबर?', 'रिता तपाईंको मित्र हो', 'रामले वात खायो।']

Similarity

In [63]:
from sklearn.metrics.pairwise import cosine_similarity
def cal_cosine_similarity(text1, text2):
    vec1 = tf_idf(text1)
    vec2 = tf_idf(text2)
    vec1 = np.asarray(vec1).reshape(-1,vec1.shape[0])
    vec2 =  np.asarray(vec2).reshape(-1,vec2.shape[0])
    print(cosine_similarity(vec1, vec2))
    return cosine_similarity(vec1, vec1)[0]

In [64]:
cal_cosine_similarity(data[0],data[1])

[[0.]]


array([1.])

In [37]:
import numpy as np

def one_hot(y):
    n_classes = np.unique(y)
    one_hot = np.zeros((len(y), len(n_classes)))
    for i, c in enumerate(y):
        one_hot[i, n_classes == c] = 1
    return one_hot

# Assuming you have four classes represented by numbers 0 to 3
y = np.array([0, 1, 2, 3, 0, 2, 1, 3, 0])

# Convert classes to one-hot encoding
one_hot_y = one_hot(y)

print(one_hot_y)


[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]]


In [1]:

import numpy as np

def softmax(z):
    exps = np.exp(z)
    sum_exps = np.sum(exps)
    return exps / sum_exps

z = np.array([1.0, 2.0, 3.0])
probabilities = softmax(z)

print(probabilities)


[0.09003057 0.24472847 0.66524096]


## Logestic Regression Breakdown

In [2]:
import numpy as np


class LogisticRegression:
    def __init__(self, lr=0.00001, n_iter=1000):
        self.lr = lr
        self.n_iter = n_iter
        self.w = None

    @staticmethod
    def one_hot(y):
        n_classes = np.unique(y)
        one_hot = np.zeros((len(y), len(n_classes)))
        for i, c in enumerate(y):
            one_hot[i, n_classes == c] = 1
        return one_hot

    def probabilities(self, X):
        z = np.dot(X, self.w.T)
        return np.exp(z) / np.sum(np.exp(z), axis=1, keepdims=True)

    def predict(self, X):
        X = np.insert(X, 0, 1, axis=1)
        return np.argmax(self.probabilities(X), axis=1)

    def accuracy(self, X, y):
        return np.mean(self.predict(X) == y)

    def fit(self, X, y):
        X = np.insert(X, 0, 1, axis=1) # (samples,dim) -> (samples, dim + 1)
        self.w = np.zeros((len(np.unique(y)), X.shape[1])) # (4, 301)
        y = self.one_hot(y)

        for _ in range(self.n_iter):
            predictions = self.probabilities(X)
            error = predictions - y
            gradient = np.dot(error.T, X)
            self.w -= self.lr * gradient
            

In [3]:
# Generating random data
np.random.seed(0) # For consistent results
X = np.random.rand(100, 300) # 100 samples, 300 features each
y = np.random.choice([0, 1, 2, 3], 100) # 100 targets belonging to one of four classes


# Creating an instance of the Logistic Regression class
model = LogisticRegression(lr=0.00001, n_iter=1)


In [4]:
# Adding bias to the feature matrix
X = np.insert(X, 0, 1, axis=1)
print(f'Shape of X after adding bias: {X.shape}')  # (100, 301)

# Initializing weights
model.w = np.zeros((len(np.unique(y)), X.shape[1]))
print(f'Initial shape of weights: {model.w.shape}')  # (4, 301)

# One-hot encoding the target variable
y = model.one_hot(y)
print(f'Shape of y after one-hot encoding: {y.shape}')  # (100, 4)

for _ in range(model.n_iter):
    # Compute probabilities
    predictions = model.probabilities(X)
    print(f'Shape of predictions: {predictions.shape}')  # (100, 4)

    # Compute error
    error = predictions - y
    print(f'Shape of error: {error.shape}')  # (100, 4)

    # Compute gradient
    gradient = np.dot(error.T, X)
    print(f'Shape of gradient: {gradient.shape}')  # (4, 301)

    # Update weights
    model.w -= model.lr * gradient
    print(f'Shape of weights after update: {model.w.shape}')  # (4, 301)


Shape of X after adding bias: (100, 301)
Initial shape of weights: (4, 301)
Shape of y after one-hot encoding: (100, 4)
Shape of predictions: (100, 4)
Shape of error: (100, 4)
Shape of gradient: (4, 301)
Shape of weights after update: (4, 301)


In [None]:
import pandas as pd

# Sample data
data = {
    'Category': ['Fruit', 'Vegetable', 'Fruit', 'Vegetable', 'Fruit'],
    'Item': ['Apple', 'Carrot', 'Banana', 'Broccoli', 'Apple'],
    'Sales': [100, 200, 150, 300, 50]
}

# Create DataFrame
df = pd.DataFrame(data)

# Group by the 'Category' column and sum the 'Sales' column
grouped_data = df.groupby('Category')['Sales'].sum().reset_index()

print(grouped_data)