In [1]:
# import standard library
import os

# import pandas
import pandas as pd

# load dataset
# The CSV location can be overridden with the NEWS_DATA_PATH environment
# variable so the notebook is not tied to one machine's Downloads folder.
# When the variable is unset or empty, the original path is used unchanged.
DATA_PATH = os.environ.get('NEWS_DATA_PATH') or 'C:/Users/tarus/Downloads/all-data2.csv'

# the file is decoded as ISO-8859-1
df = pd.read_csv(DATA_PATH, encoding="ISO-8859-1")

# preview of dataset
df.head()

Unnamed: 0,Sentiments,News Headlines
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [3]:
# import libraries
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# drop every row that contains at least one missing value
df = df.dropna(axis=0, how='any')

# replace the sentiment labels in df with integer class ids;
# the fitted encoder is kept in `le`
le = LabelEncoder()
encoded_sentiments = le.fit_transform(df['Sentiments'])
df['Sentiments'] = encoded_sentiments

# model input: headline texts, model output: encoded sentiment ids
X = df['News Headlines'].tolist()
y = df['Sentiments'].tolist()

# hold out a quarter of the samples for testing; the fixed seed keeps the split repeatable
split_options = {'test_size': 0.25, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# word-level bag-of-words vectorizer restricted to unigrams, with English stop words removed
cv = CountVectorizer(
    ngram_range=(1, 1),
    analyzer='word',
    stop_words='english',
)

# the vocabulary is learned from the training split only and then reused on the test split
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

In [6]:
# import libraries
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
import numpy as np

# fit a multinomial naive Bayes model on the bag-of-words training matrix
clf = MultinomialNB().fit(X_train_cv, y_train)

# predict the sentiment class of every held-out headline
y_pred = clf.predict(X_test_cv)

# micro-averaged F-1 on the test split, printed with four decimals
# NOTE(review): with exactly one label per sample, micro-averaged F-1 should
# coincide with plain accuracy — confirm this is the intended metric
score = f1_score(y_test, y_pred, average='micro')
rounded_score = np.round(score, 4)
print('F-1 score : {}'.format(rounded_score))

F-1 score : 0.7244


In [7]:
# word-level bag-of-words vectorizer covering unigrams and bigrams, with English stop words removed
cv = CountVectorizer(
    ngram_range=(1, 2),
    analyzer='word',
    stop_words='english',
)

# the vocabulary is learned from the training split only and then reused on the test split
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

# fit a fresh multinomial naive Bayes model on the new feature matrix
clf = MultinomialNB().fit(X_train_cv, y_train)

# predict the sentiment class of every held-out headline
y_pred = clf.predict(X_test_cv)

# micro-averaged F-1 on the test split, printed with four decimals
score = f1_score(y_test, y_pred, average='micro')
rounded_score = np.round(score, 4)
print('F-1 score : {}'.format(rounded_score))

F-1 score : 0.7302


In [8]:
# sweep the largest n-gram length from 1 to 10 and report the test F-1 for each setting
for N in range(1, 11):
    ngram_range = (1, N)

    # bag of words over all n-grams up to length N; vocabulary learned on the training split only
    cv = CountVectorizer(ngram_range=ngram_range, analyzer='word', stop_words='english')
    X_train_cv = cv.fit_transform(X_train)
    X_test_cv = cv.transform(X_test)

    # fit naive Bayes on the training matrix and predict the held-out split
    clf = MultinomialNB().fit(X_train_cv, y_train)
    y_pred = clf.predict(X_test_cv)

    # micro-averaged F-1, rounded to four decimals before printing
    raw_f1 = f1_score(y_test, y_pred, average='micro')
    score = np.round(raw_f1, 4)
    print('F-1 score of model with n-gram range of {}: {}'.format(ngram_range, score))

F-1 score of model with n-gram range of (1, 1): 0.7244
F-1 score of model with n-gram range of (1, 2): 0.7302
F-1 score of model with n-gram range of (1, 3): 0.7261
F-1 score of model with n-gram range of (1, 4): 0.7285
F-1 score of model with n-gram range of (1, 5): 0.7277
F-1 score of model with n-gram range of (1, 6): 0.7285
F-1 score of model with n-gram range of (1, 7): 0.7277
F-1 score of model with n-gram range of (1, 8): 0.7277
F-1 score of model with n-gram range of (1, 9): 0.7269
F-1 score of model with n-gram range of (1, 10): 0.7269
