In [1]:
import glob
import re
import joblib
import nepalitokenizer

In [2]:
categories = ['art' , 'economy' , 'foreign' , 'health' ,'interview' , 'opinion' , 'politics' ,
              'society' , 'sports' ,'technology' , 'world' ]

In [3]:
data = []


for i, filename in enumerate(categories):
    path = f'nagarik/{filename}/*.txt'
    files = glob.glob(path)
    for j , file in enumerate(files):
        with open(file  ,'r' , encoding='utf-8') as f:
            data.append((f.read() , filename))

In [4]:
import pandas as pd

df = pd.DataFrame(data , columns=['Text' , 'Label'])


In [5]:
df.head()

Unnamed: 0,Text,Label
0,मकवानपुर – फिल्म ‘हुर्रे’ले रिलिज मिति नजिकिएस...,art
1,काठमाडौं – मोक्षदा स्कुलको मञ्जरी थिएटरमा उड्क...,art
2,कठमाडौं – रसियाको नोभोसिबिर्सक सहरमा भइरहेको ‘...,art
3,मुम्बई – चर्चित अभिनेत्री रानी मुखर्जीले बलिउड...,art
4,मुम्बई – साउथ इण्डियनतर्फको मलयालम फिल्मको एकै...,art


In [6]:
df.shape

(4481, 2)

In [7]:
df.isnull().sum()

Text     0
Label    0
dtype: int64

In [8]:
df.Label.value_counts()

economy       510
opinion       507
foreign       498
society       494
politics      483
world         478
art           467
sports        413
technology    238
interview     201
health        192
Name: Label, dtype: int64

In [9]:
import spacy

In [12]:
X = df['Text']
y = df['Label']

In [13]:
X.shape , y.shape

((4481,), (4481,))

In [14]:
text = 'मकवानपुर – !?!फिल्म ‘?हुर्रे’!)!!'

In [15]:
tokenizer = nepalitokenizer.NepaliTokenizer(punct=['-' ,'‘'])

In [16]:
tokenizer.tokenizer(text)

['मकवानपुर', '–', 'फिल्म', 'हुर्रे’']

In [17]:


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [18]:
#now split the dataset

X_train , X_test , y_train ,y_test = train_test_split(X , y , test_size=0.2 , shuffle=True , random_state=42)

In [19]:
X_train.shape , X_test.shape

((3584,), (897,))

###  Using tfidfvectorizer

In [20]:
tfidf = TfidfVectorizer(tokenizer=tokenizer.tokenizer)

X_train  =tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier 

log = LogisticRegression(multi_class="multinomial")
log.fit(X_train , y_train)
log.score(X_test , y_test)

0.8316610925306578

In [22]:
from sklearn.metrics import accuracy_score , classification_report , confusion_matrix

In [23]:
y_pred = log.predict(X_test)

In [24]:
print(accuracy_score(y_test , y_pred)) 

0.8316610925306578


In [25]:
classification_report(y_test , y_pred)

'              precision    recall  f1-score   support\n\n         art       0.89      0.93      0.91        98\n     economy       0.76      0.75      0.75        96\n     foreign       0.92      0.88      0.90       104\n      health       0.80      0.82      0.81        44\n   interview       0.89      0.50      0.64        32\n     opinion       0.82      0.91      0.86        95\n    politics       0.78      0.71      0.74        96\n     society       0.58      0.72      0.64        92\n      sports       0.97      0.97      0.97        91\n  technology       0.97      0.75      0.85        48\n       world       0.91      0.94      0.93       101\n\n    accuracy                           0.83       897\n   macro avg       0.85      0.81      0.82       897\nweighted avg       0.84      0.83      0.83       897\n'

In [26]:
confusion_matrix(y_test , y_pred)

array([[91,  0,  0,  1,  0,  2,  1,  2,  0,  1,  0],
       [ 0, 72,  1,  1,  1,  0,  4, 15,  1,  0,  1],
       [ 2,  3, 92,  0,  0,  1,  1,  1,  1,  0,  3],
       [ 0,  0,  0, 36,  0,  0,  3,  5,  0,  0,  0],
       [ 1,  0,  0,  0, 16, 12,  2,  1,  0,  0,  0],
       [ 1,  1,  1,  2,  0, 86,  0,  2,  1,  0,  1],
       [ 0,  8,  1,  0,  1,  3, 68, 15,  0,  0,  0],
       [ 4,  9,  3,  3,  0,  1,  6, 66,  0,  0,  0],
       [ 1,  0,  1,  0,  0,  0,  0,  1, 88,  0,  0],
       [ 1,  2,  1,  2,  0,  0,  0,  2,  0, 36,  4],
       [ 1,  0,  0,  0,  0,  0,  2,  3,  0,  0, 95]], dtype=int64)

In [28]:
#get user input

def get_user_input(text):
    
    text_train=  tfidf.transform([text])
    
    text_predict = log.predict(text_train)
    
    return text_predict

In [29]:
#works like a butter
get_user_input('धेरै गोल हानेर गेम जितियो')

array(['sports'], dtype=object)

In [30]:
get_user_input('सरकारले देशलाई गरीब बनाएर छोडे ')

array(['society'], dtype=object)

In [31]:
get_user_input('नयाँ मोबाइल ')

array(['technology'], dtype=object)

In [32]:
get_user_input('कोरोनाको महमारी')

array(['society'], dtype=object)

In [33]:
get_user_input('रुघा र खोकी  ')

array(['society'], dtype=object)

In [34]:
#save the model

joblib.dump(tfidf , './tfidfmodel.joblib')

['./tfidfmodel.joblib']

In [35]:
joblib.dump(log , './logmodel.joblib')

['./logmodel.joblib']