# Sentiment Analysis for Hindi

## Preparing the data

In [1]:
import pandas as pd
import re

# URL-like tokens: optional http(s) scheme, dotted host, optional path.
# The pattern text is unchanged; it is now a raw string so that escapes such as
# `\/` and `\d` no longer trigger "invalid escape sequence" warnings.
_URL_RE = re.compile(r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w\.-]*)*\/?\S')

# Runs of ASCII letters and digits (Latin-script words, handles, numbers).
_LATIN_ALNUM_RE = re.compile(r'[A-Za-z0-9]+')

# Single characters deleted outright:  @ # , . / \ ( ) [ ] | - : ? _ ! ' "
# The backslash is now really removed: the old non-raw pattern `([\\])` reached
# the regex engine as `[\]`, which never matched a backslash and instead
# stripped `] ) | ( [`. Those characters are still stripped here so existing
# cleaned text is unaffected.
_NOISE_TABLE = str.maketrans('', '', '@#,./\\()[]|-:?_!\'"')


def clean(strng):
    """Strip URLs, Latin letters/digits and common punctuation from a text.

    Steps, in order:
      1. remove URL-like tokens,
      2. remove every run of ASCII letters and digits (this also empties
         ``@handle`` / ``#tag`` bodies, leaving only the sigil),
      3. delete the punctuation characters listed in ``_NOISE_TABLE``,
      4. trim leading and trailing whitespace.

    Inner whitespace is left untouched, so removed tokens may leave runs of
    spaces behind.

    Args:
        strng: Raw text. Any non-string value (for example ``None`` or the
            float NaN that pandas uses for blank cells) is treated as an
            empty document.

    Returns:
        The cleaned string; ``''`` when nothing is left or the input was not
        a string.
    """
    if not isinstance(strng, str):
        return ''
    strng = _URL_RE.sub('', strng)
    strng = _LATIN_ALNUM_RE.sub('', strng)
    strng = strng.translate(_NOISE_TABLE)
    return strng.strip()

# Load the labelled dataset from the OpenDocument spreadsheet.
df = pd.read_excel("data.ods", engine="odf")

# Clean the whole column in one assignment. The previous element-wise
# `df['Text'][i] = ...` was chained assignment: it raises
# SettingWithCopyWarning, does not modify `df` when pandas Copy-on-Write is
# active, and only worked while the index labels happened to be 0..n-1.
df['Text'] = df['Text'].map(clean)
df.head()

Unnamed: 0,Text,Label
0,लोग वतन तक खा जाते हैं इसका इसे यकीन नहींमान ज...,negative
1,गुमनाम है वतन पर मिटने वाले लोग आतन्कवादियों स...,negative
2,ज़ंजीर बदली जा रही थी मैं समझा था रिहाई हो गयी है,negative
3,यूपी में बड़े स्तर पर दंगे करवा सकती है बीजेपी...,negative
4,अंग्रेजी नहीं आती है इसलिए हिन्दी ट्विट ज्यादा...,negative


## Splitting the data

In [2]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    df['Text'],
    df['Label'],
    test_size=0.3,
    random_state=1,
)

## Initializing the vectorizers

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Hindi stop words handed to both vectorizers below via `stop_words=`.
# The list deliberately contains several spellings of the same word
# (e.g. "अभि"/"अभी", "यहाँ"/"यहां"), since matching is done on exact tokens.
hindi_stop_words = ["अंदर","अत","अदि","अप","अपना","अपनि","अपनी","अपने","अभि","अभी","आदि","आप","इंहिं","इंहें","इंहों","इतयादि","इत्यादि","इन","इनका","इन्हीं","इन्हें","इन्हों","इस","इसका","इसकि","इसकी","इसके","इसमें","इसि","इसी","इसे","उंहिं","उंहें","उंहों","उन","उनका","उनकि","उनकी","उनके","उनको","उन्हीं","उन्हें","उन्हों","उस","उसके","उसि","उसी","उसे","एक","एवं","एस","एसे","ऐसे","ओर","और","कइ","कई","कर","करता","करते","करना","करने","करें","कहते","कहा","का","काफि","काफ़ी","कि","किंहें","किंहों","कितना","किन्हें","किन्हों","किया","किर","किस","किसि","किसी","किसे","की","कुछ","कुल","के","को","कोइ","कोई","कोन","कोनसा","कौन","कौनसा","गया","घर","जब","जहाँ","जहां","जा","जिंहें","जिंहों","जितना","जिधर","जिन","जिन्हें","जिन्हों","जिस","जिसे","जीधर","जेसा","जेसे","जैसा","जैसे","जो","तक","तब","तरह","तिंहें","तिंहों","तिन","तिन्हें","तिन्हों","तिस","तिसे","तो","था","थि","थी","थे","दबारा","दवारा","दिया","दुसरा","दुसरे","दूसरे","दो","द्वारा","न","नहिं","नहीं","ना","निचे","निहायत","नीचे","ने","पर","पहले","पुरा","पूरा","पे","फिर","बनि","बनी","बहि","बही","बहुत","बाद","बाला","बिलकुल","भि","भितर","भी","भीतर","मगर","मानो","मे","में","यदि","यह","यहाँ","यहां","यहि","यही","या","यिह","ये","रखें","रवासा","रहा","रहे","ऱ्वासा","लिए","लिये","लेकिन","व","वगेरह","वरग","वर्ग","वह","वहाँ","वहां","वहिं","वहीं","वाले","वुह","वे","वग़ैरह","संग","सकता","सकते","सबसे","सभि","सभी","साथ","साबुत","साभ","सारा","से","सो","हि","ही","हुअ","हुआ","हुइ","हुई","हुए","हे","हें","है","हैं","हो","होता","होति","होती","होते","होना","होने"]

# Both vectorizers split documents on whitespace and emit unigrams only.
# - `tokenizer=str.split` behaves exactly like `lambda x: x.split()` but is a
#   named callable, so the fitted vectorizers can be pickled.
# - `token_pattern=None` tells scikit-learn the default regex is intentionally
#   unused; without it every fit warns that `token_pattern` is ignored because
#   a tokenizer was supplied.
cv = CountVectorizer(
    lowercase=True,
    stop_words=hindi_stop_words,
    ngram_range=(1, 1),
    tokenizer=str.split,
    token_pattern=None,
)
tf = TfidfVectorizer(
    lowercase=True,
    stop_words=hindi_stop_words,
    ngram_range=(1, 1),
    tokenizer=str.split,
    token_pattern=None,
)

## Using Multinomial Naive Bayes Classifier 

In [4]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [5]:
# Multinomial NB on raw token counts; the vocabulary is learned from the
# training split only and then applied to the test split.
X_traincv = cv.fit_transform(X_train)
clfcv = MultinomialNB()
clfcv.fit(X_traincv, y_train)
predictedcv = clfcv.predict(cv.transform(X_test))
accuracy_nb_cv = metrics.accuracy_score(y_test, predictedcv)
print("MultinomialNB Accuracy:", accuracy_nb_cv)

MultinomialNB Accuracy: 0.7591776798825257


In [6]:
# Multinomial NB on TF-IDF weights; the vectorizer is fitted on the training
# split only and then applied to the test split.
X_traintf = tf.fit_transform(X_train)
clftf = MultinomialNB()
clftf.fit(X_traintf, y_train)
predictedtf = clftf.predict(tf.transform(X_test))
accuracy_nb_tf = metrics.accuracy_score(y_test, predictedtf)
print("MultinomialNB Accuracy:", accuracy_nb_tf)

MultinomialNB Accuracy: 0.7529368575624082


In [7]:
# Spot-check the TF-IDF Naive Bayes model on one unseen sentence, passing it
# through the same cleaning and vectorizing steps as the training data.
sample_text = "पश्चिम बंगाल में शासन द्वारा ऐसी शिकायतें लगातार आने पर चिटफंड कंपनियों के ऊपर प्रतिबंध लगाना पड़ा है"
sample_features = tf.transform([clean(sample_text)])
clftf.predict(sample_features)

array(['negative'], dtype='<U8')

## Using Logistic Regression 

In [8]:
from sklearn.linear_model import LogisticRegression
# A single LogisticRegression instance with default settings. The cells below
# fit it on the count features first and then fit the same object again on the
# TF-IDF features, so after that second fit `lr` only reflects the TF-IDF run.
lr = LogisticRegression()

In [9]:
# Logistic regression on raw token counts (reuses X_traincv from the
# count-vectorizer cell above).
predcv = lr.fit(X_traincv, y_train).predict(cv.transform(X_test))
accuracy_lr_cv = metrics.accuracy_score(y_test, predcv)
print("Logistic Accuracy:", accuracy_lr_cv)

Logistic Accuracy: 0.7635829662261381


In [10]:
# Logistic regression on TF-IDF weights (reuses X_traintf from the TF-IDF
# cell above); this fits the same `lr` object again.
predtf = lr.fit(X_traintf, y_train).predict(tf.transform(X_test))
accuracy_lr_tf = metrics.accuracy_score(y_test, predtf)
print("Logistic Accuracy:", accuracy_lr_tf)

Logistic Accuracy: 0.7591776798825257


In [11]:
# Spot-check the logistic-regression model (last fitted on TF-IDF features)
# on one unseen sentence.
sample_text = "पश्चिम बंगाल में शासन द्वारा ऐसी शिकायतें लगातार आने पर चिटफंड कंपनियों के ऊपर प्रतिबंध लगाना पड़ा है"
sample_features = tf.transform([clean(sample_text)])
lr.predict(sample_features)

array(['negative'], dtype=object)

## Using Stochastic Gradient Descent (Linear SVM)

In [12]:
from sklearn.linear_model import SGDClassifier
# SGDClassifier shuffles the training data while fitting, so without a seed the
# accuracies printed below change from run to run. Pin the seed (same value as
# the train/test split) to keep the notebook reproducible.
sgd = SGDClassifier(random_state=1)

In [13]:
# SGD-trained linear classifier on raw token counts (reuses X_traincv from
# the count-vectorizer cell above).
predsgdcv = sgd.fit(X_traincv, y_train).predict(cv.transform(X_test))
accuracy_sgd_cv = metrics.accuracy_score(y_test, predsgdcv)
print("SGD Accuracy:", accuracy_sgd_cv)

SGD Accuracy: 0.750367107195301


In [14]:
# SGD-trained linear classifier on TF-IDF weights (reuses X_traintf from the
# TF-IDF cell above); this fits the same `sgd` object again.
predsgdtf = sgd.fit(X_traintf, y_train).predict(tf.transform(X_test))
accuracy_sgd_tf = metrics.accuracy_score(y_test, predsgdtf)
print("SGD Accuracy:", accuracy_sgd_tf)

SGD Accuracy: 0.7650513950073421


In [16]:
# Spot-check the SGD model (last fitted on TF-IDF features) on one unseen
# sentence.
sample_text = "पश्चिम बंगाल में शासन द्वारा ऐसी शिकायतें लगातार आने पर चिटफंड कंपनियों के ऊपर प्रतिबंध लगाना पड़ा है"
sample_features = tf.transform([clean(sample_text)])
sgd.predict(sample_features)

array(['negative'], dtype='<U8')