In [1]:
import pandas as pd

#  load dataset
df = pd.read_csv('all-data.csv',names=['labels','messages'],encoding='ISO-8859-1')
df.head()

Unnamed: 0,labels,messages
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [2]:
import nltk
nltk.download()


showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [3]:
#  Data cleaning and preprocessing

import re
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 

lemmatizer = WordNetLemmatizer()
sn = SnowballStemmer(language='english')
corpus = []

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

for i in range(0, len(df)):
    review = re.sub('[^a-zA-Z]',' ',df.messages[i])
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in review if w not in set(stopwords.words('english'))]
    review = [sn.stem(word) for word in review if word not in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)



In [4]:
novidata = df.copy()

In [5]:
#  Creating the Bag of Words model

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=5000)
X = cv.fit_transform(corpus).toarray()

#  Creating the TF-IDF model

from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer()
X1 = cv.fit_transform(corpus).toarray()

In [6]:
df['labels'] = [0 if labels == "negative"
             else 1 if labels == "neutral"
             else 2 for labels in df["labels"]]

novidata['labels'] = [0 if labels == "negative"
             else 1 if labels == "neutral"
             else 2 for labels in novidata["labels"]]

In [7]:
df

Unnamed: 0,labels,messages
0,1,"According to Gran , the company has no plans t..."
1,1,Technopolis plans to develop in stages an area...
2,0,The international electronic industry company ...
3,2,With the new production plant the company woul...
4,2,According to the company 's updated strategy f...
...,...,...
4841,0,LONDON MarketWatch -- Share prices ended lower...
4842,1,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,0,Operating profit fell to EUR 35.4 mn from EUR ...
4844,0,Net sales of the Paper segment decreased to EU...


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, precision_score, f1_score

y = df['labels']

X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size = 0.20, random_state = 0) #  try with X and X1

#  Training models using Naive bayes classifier, Logistic Regression and SVC

models = {
    MultinomialNB():'Multinomial Naive Bayes',
    LogisticRegression(max_iter=300):'Logistic Regression',
    SVC(C = 1.0, kernel = 'linear', degree = 3, gamma = 'auto'):'Support Vector Machine'
}

for m in models.keys():
    m.fit(X_train,y_train)
    
for model,name in models.items():
     print(f"Accuracy Score for {name} is : ", model.score(X_test,y_test)*100, "%")


Accuracy Score for Multinomial Naive Bayes is :  67.7319587628866 %
Accuracy Score for Logistic Regression is :  74.63917525773196 %
Accuracy Score for Support Vector Machine is :  75.36082474226804 %


In [9]:
from gensim.models import Word2Vec

corpus = [nltk.word_tokenize(corpus) for corpus in corpus]

for i in range(len(corpus)):
    corpus[i] = [word for word in corpus[i] if word not in stopwords.words('english')]
    
#  Creating model using Word2Vec
model = Word2Vec(corpus, window = 5, min_count = 1, sg = 0, negative = 1)

#  Get 3 most similar words 
similar = model.wv.most_similar(positive = ['industri'], topn = 3)
print(similar)





[('servic', 0.9678139686584473), ('provid', 0.9659014940261841), ('develop', 0.9657852649688721)]


In [10]:
similar = model.wv.most_similar(positive=['develop'], topn = 3)
print(similar)

[('technolog', 0.9785764217376709), ('servic', 0.9765380620956421), ('solut', 0.9759657382965088)]


In [11]:
import numpy as np

novi2 = []
novi1 = []

#  Embeddings
for j in range(0,len(corpus)):
    for i in range(0,len(corpus[j])):
        novi = (model.wv[corpus[j][i]])
        novi1.append(novi)
    if(len(novi1)>0):
        novi2.append(np.average(novi1, axis = 0))
    else:
        novidata = novidata.drop(novidata.index[j])
    novi1=[]
   


In [12]:
#  Training models using Naive bayes classifier, Logistic Regression, SVC and RFC with Word2Vec

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, precision_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier

y1 = novidata['labels']

X_train1, X_test1, y_train1, y_test1 = train_test_split(novi2, y1, test_size = 0.20, random_state = 0)

#  Needs scaler for MNB (negative values in vectors)
scaler = MinMaxScaler()
X_train1 = scaler.fit_transform(X_train1)
X_test1 = scaler.transform(X_test1)

models1 = {
    MultinomialNB():'Multinomial Naive Bayes',
    RandomForestClassifier(n_estimators = 20, random_state = 0):'Random Forest Classifier',
    SVC(C = 1.0, kernel = 'linear', degree = 3, gamma = 'auto'): 'Support Vector Machine',
    LogisticRegression(max_iter = 300):'Logistic Regression'
}

for m in models1.keys():
    m.fit(X_train1, y_train1)
    
for model, name in models1.items():
     print(f"Accuracy Score for {name} is : ", model.score(X_test1, y_test1)*100, "%")


Accuracy Score for Multinomial Naive Bayes is :  58.204334365325074 %
Accuracy Score for Random Forest Classifier is :  62.22910216718266 %
Accuracy Score for Support Vector Machine is :  64.18988648090816 %
Accuracy Score for Logistic Regression is :  65.3250773993808 %


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
