In [128]:
import pandas as pd

#  load dataset
df = pd.read_csv('all-data.csv',names=['labels','messages'],encoding='ISO-8859-1')
df.head()

Unnamed: 0,labels,messages
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [129]:
import nltk
nltk.download()


showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [130]:
#  Data cleaning and preprocessing

import re
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 

lemmatizer = WordNetLemmatizer()
sn = SnowballStemmer(language='english')
corpus = []

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

for i in range(0, len(df)):
    review = re.sub('[^a-zA-Z]',' ',df.messages[i])
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in review if w not in set(stopwords.words('english'))]
    review = [sn.stem(word) for word in review if word not in set(stopwords.words('english'))]
    review = ' '.join(review)
   # review = ' '.[sn.stem(lemmatizer.lemmatize(w, get_wordnet_pos(w))) for w in review.lower().split() if w not in set(stopwords.words('english'))]
    corpus.append(review)



In [131]:
new_data = df.copy()

In [177]:
df.messages

0       According to Gran , the company has no plans t...
1       Technopolis plans to develop in stages an area...
2       The international electronic industry company ...
3       With the new production plant the company woul...
4       According to the company 's updated strategy f...
                              ...                        
4841    LONDON MarketWatch -- Share prices ended lower...
4842    Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843    Operating profit fell to EUR 35.4 mn from EUR ...
4844    Net sales of the Paper segment decreased to EU...
4845    Sales in Finland decreased by 10.5 % in Januar...
Name: messages, Length: 4846, dtype: object

In [132]:
print(get_wordnet_pos('love'))
print(get_wordnet_pos('lovely'))
print(get_wordnet_pos('loving'))

n
r
v


In [133]:
corpus[0:2]

['accord gran compani plan move product russia although compani grow',
 'technopoli plan develop stage area less squar meter order host compani work comput technolog telecommun statement say']

In [158]:
#  Creating the Bag of Words model

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
X = cv.fit_transform(corpus[0:2]).toarray()

#  Creating the TF-IDF model

from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer()
X1 = cv.fit_transform(corpus[0:2]).toarray()

In [159]:
print(cv.get_feature_names_out())
print(X1)

['accord' 'although' 'area' 'compani' 'comput' 'develop' 'gran' 'grow'
 'host' 'less' 'meter' 'move' 'order' 'plan' 'product' 'russia' 'say'
 'squar' 'stage' 'statement' 'technolog' 'technopoli' 'telecommun' 'work']
[[0.32391104 0.32391104 0.         0.46093075 0.         0.
  0.32391104 0.32391104 0.         0.         0.         0.32391104
  0.         0.23046538 0.32391104 0.32391104 0.         0.
  0.         0.         0.         0.         0.         0.        ]
 [0.         0.         0.2499025  0.17780768 0.2499025  0.2499025
  0.         0.         0.2499025  0.2499025  0.2499025  0.
  0.2499025  0.17780768 0.         0.         0.2499025  0.2499025
  0.2499025  0.2499025  0.2499025  0.2499025  0.2499025  0.2499025 ]]


In [135]:
df['labels'] = [0 if labels == "negative"
             else 1 if labels == "neutral"
             else 2 for labels in df["labels"]]

new_data['labels'] = [0 if labels == "negative"
             else 1 if labels == "neutral"
             else 2 for labels in new_data["labels"]]

In [136]:
df

Unnamed: 0,labels,messages
0,1,"According to Gran , the company has no plans t..."
1,1,Technopolis plans to develop in stages an area...
2,0,The international electronic industry company ...
3,2,With the new production plant the company woul...
4,2,According to the company 's updated strategy f...
...,...,...
4841,0,LONDON MarketWatch -- Share prices ended lower...
4842,1,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,0,Operating profit fell to EUR 35.4 mn from EUR ...
4844,0,Net sales of the Paper segment decreased to EU...


In [139]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, precision_score, f1_score

y = df['labels']

X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size = 0.20, random_state = 0) #  try with X and X1

#  Model training using Naive bayes classifier, Logistic Regression and SVC

models = {
    MultinomialNB():'Multinomial Naive Bayes',
    LogisticRegression(max_iter=300):'Logistic Regression',
    SVC(C = 1.0, kernel = 'linear', degree = 3, gamma = 'auto'):'Support Vector Machine'
}

for m in models.keys():
    m.fit(X_train,y_train)
    
for model,name in models.items():
     print(f"Accuracy Score for {name} is : ", model.score(X_test,y_test)*100, "%")


Accuracy Score for Multinomial Naive Bayes is :  67.7319587628866 %
Accuracy Score for Logistic Regression is :  74.63917525773196 %
Accuracy Score for Support Vector Machine is :  75.36082474226804 %


In [140]:
from gensim.models import Word2Vec

list_of_lists = [nltk.word_tokenize(corpus) for corpus in corpus]

    
#  Creating model using Word2Vec

model = Word2Vec(list_of_lists, window = 5, min_count = 1, sg = 0, hs = 1)

#  Get 3 most similar words 
#similar = model.wv.most_similar(positive = ['industri'], topn = 3)
#print(similar)





In [141]:
similar = model.wv.most_similar('compani', topn = 3)
print(similar)

[('maker', 0.9574943780899048), ('aspocomp', 0.9552718997001648), ('rautaruukki', 0.9550167918205261)]


In [160]:
model.wv.similarity('russia', 'meter')

0.6635804

In [None]:
list_of_lists = [nltk.word_tokenize(corpus) for corpus in corpus]

In [98]:
list_of_lists[0:2]

[['accord',
  'gran',
  'compani',
  'plan',
  'move',
  'product',
  'russia',
  'although',
  'compani',
  'grow'],
 ['technopoli',
  'plan',
  'develop',
  'stage',
  'area',
  'less',
  'squar',
  'meter',
  'order',
  'host',
  'compani',
  'work',
  'comput',
  'technolog',
  'telecommun',
  'statement',
  'say']]

In [29]:
similar = model.wv.most_similar(positive=['develop'], topn = 3)
print(similar)

[('solut', 0.9810017347335815), ('provid', 0.9808456897735596), ('servic', 0.9806942939758301)]


In [120]:
model.wv['compani']

array([-2.36035865e-02,  5.80423959e-02, -7.27785751e-02,  1.32263377e-02,
       -3.38913649e-02, -1.02723492e-02, -4.37395163e-02, -2.98373476e-02,
       -1.39214536e-02, -3.56604978e-02, -3.56875695e-02, -1.57713890e-02,
        2.73840278e-02, -1.07569229e-02,  1.64680686e-02, -3.34480144e-02,
       -3.31588499e-02,  5.95135130e-02, -2.26852037e-02,  1.08586363e-01,
       -8.44427496e-02,  9.23120752e-02,  2.28032265e-02,  9.54171177e-03,
        9.62206945e-02, -7.79979080e-02,  3.09958551e-02,  1.00962013e-01,
       -2.19753035e-03, -2.49636844e-02,  9.02668536e-02, -1.75293721e-02,
        4.40836558e-03,  5.49590252e-02,  5.03864847e-02,  9.44275334e-02,
       -2.83048925e-04, -6.04072912e-03, -5.90352388e-03,  4.33618948e-03,
        2.83837896e-02,  4.06741463e-02,  5.88304922e-03, -7.64686838e-02,
       -3.09005566e-02, -1.11655938e-02, -8.45301077e-02, -1.38409454e-02,
       -5.40145338e-02, -3.74493864e-03, -1.85833555e-02,  4.04233811e-03,
        1.52278580e-02,  

In [175]:
import numpy as np

#  Embeddings

embeddings = []

for j in range(0,len(list_of_lists)):
    temp = []
    for i in range(0, len(list_of_lists[j])):
        temp.append(model.wv[list_of_lists[j][i]])
    if(len(temp)>0):
        embeddings.append(np.average(temp, axis = 0))
   


In [176]:
#  Model training using Naive bayes classifier, Logistic Regression, SVC and RFC with Word2Vec

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, precision_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier

y1 = novidata['labels']

X_train1, X_test1, y_train1, y_test1 = train_test_split(embeddings, y1, test_size = 0.20, random_state = 0)

#  Needs scaler for MNB (negative values in vectors)
scaler = MinMaxScaler()
X_train1 = scaler.fit_transform(X_train1)
X_test1 = scaler.transform(X_test1)

models1 = {
    MultinomialNB():'Multinomial Naive Bayes',
    RandomForestClassifier(n_estimators = 20, random_state = 0):'Random Forest Classifier',
    SVC(C = 1.0, kernel = 'linear', degree = 3, gamma = 'auto'): 'Support Vector Machine',
    LogisticRegression(max_iter = 300):'Logistic Regression'
}

for m in models1.keys():
    m.fit(X_train1, y_train1)
    
for model, name in models1.items():
     print(f"Accuracy Score for {name} is : ", model.score(X_test1, y_test1)*100, "%")


Accuracy Score for Multinomial Naive Bayes is :  57.06914344685242 %
Accuracy Score for Random Forest Classifier is :  62.848297213622295 %
Accuracy Score for Support Vector Machine is :  64.49948400412796 %
Accuracy Score for Logistic Regression is :  64.49948400412796 %


In [174]:
len(embeddings)

0

In [None]:
!pip install sentence-transformers

In [32]:
from sentence_transformers import SentenceTransformer

model_b = SentenceTransformer('bert-base-nli-mean-tokens')

model_b

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [None]:
pip install -U sentence-transformers

In [40]:
embeddings = model_b.encode(corpus)


In [44]:
#  Model training using Naive bayes classifier, Logistic Regression, SVC and RFC with BERT

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, precision_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier


X_train1, X_test1, y_train1, y_test1 = train_test_split(embeddings, y, test_size = 0.20, random_state = 0)

#  Needs scaler for MNB (negative values in vectors)
scaler = MinMaxScaler()
X_train1 = scaler.fit_transform(X_train1)
X_test1 = scaler.transform(X_test1)

models_b = {
    MultinomialNB():'Multinomial Naive Bayes',
    RandomForestClassifier(n_estimators = 20, random_state = 0):'Random Forest Classifier',
    SVC(C = 1.0, kernel = 'linear', degree = 3, gamma = 'auto'): 'Support Vector Machine',
   # LogisticRegression(max_iter = 300):'Logistic Regression'
}

for m in models_b.keys():
    m.fit(X_train1, y_train1)
    
for model, name in models_b.items():
     print(f"Accuracy Score for {name} is : ", model.score(X_test1, y_test1)*100, "%")


Accuracy Score for Multinomial Naive Bayes is :  64.3298969072165 %
Accuracy Score for Random Forest Classifier is :  67.62886597938144 %
Accuracy Score for Support Vector Machine is :  73.19587628865979 %
