## Read into Python

Let's first read the required data from the CSV file using the pandas library.

In [None]:
import pandas as pd
import csv
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np                  #linear algebra
import pandas as pd                 # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt     #For Visualisation
import seaborn as sns               #For better Visualisation
from bs4 import BeautifulSoup       #For Text Parsing

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import warnings
# NOTE(review): this silences every Python warning from this point on, which can hide
# real problems (e.g. deprecations) — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Load the raw dataset and keep only the date, headline and label columns.
data = pd.read_csv('main.csv').loc[:, ['Dates', 'News', 'PriceSentiment']]
print(data.shape)
data.head(7)


Now, let's see what the data looks like…

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Drop every row that has at least one missing value, then confirm none remain.
data = data.dropna(axis=0, how='any')
data.isna().sum()

In [None]:
# Distinct sentiment labels present in the dataset.
Sentiment = data['PriceSentiment'].unique()
print(Sentiment)

# Bar chart of the number of news items per sentiment label.
data.groupby('PriceSentiment')['News'].count().plot.bar(ylim=0)
plt.show()


In [None]:
import nltk
# Fetch the NLTK stop-word corpus, which `stopwords.words("english")` reads later on.
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

stemmer = PorterStemmer()
words = stopwords.words("english")
# Set copy of the stop-word list for O(1) membership tests; `words` itself stays a list.
_stop_word_set = set(words)


def _clean_news_text(text):
    """Return `text` reduced to lowercase, stop-word-free, stemmed tokens.

    Steps: replace every non-letter character with a space, lowercase, split on
    whitespace, drop stop words, stem each remaining token, and join with spaces.

    Lowercasing happens *before* the stop-word filter. The NLTK stop-word list is
    lowercase, so filtering the original-case tokens would let capitalised stop
    words such as "The" or "In" slip through.
    """
    tokens = re.sub("[^a-zA-Z]", " ", text).lower().split()
    return " ".join(
        stemmer.stem(token) for token in tokens if token not in _stop_word_set
    )


# One cleaned string per headline, stored alongside the raw `News` column.
data['processedtext'] = data['News'].apply(_clean_news_text)


In [None]:
# Inspect the frame's dimensions and its first ten rows.
print(data.shape)
data.head(10)

## Pre-process Data

This is the data pre-processing stage. We strip surrounding whitespace from the processed text and convert it to lowercase; no columns are removed at this step.

In [None]:
def preprocess_data(data):
    """Return a copy of `data` with a normalized ``processedtext`` column.

    The ``processedtext`` values are stripped of leading/trailing whitespace and
    lowercased. The input frame is left untouched, so re-running the calling
    cell always starts from the same state.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a string ``processedtext`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns in the same order; only
        ``processedtext`` differs from the input.

    Raises
    ------
    KeyError
        If ``processedtext`` is not a column of `data`.
    """
    normalized = data['processedtext'].str.strip().str.lower()
    # `assign` builds a new frame instead of writing into the caller's object.
    return data.assign(processedtext=normalized)

In [None]:
# Normalize the `processedtext` column (strip + lowercase) via preprocess_data.
data = preprocess_data(data)

## Splitting Data

First, separate the columns into independent and dependent variables (features and label). Then split those variables into training and test sets.

In [None]:
df = data
# Features are the headlines; labels are the price-sentiment classes.
# NOTE(review): the split uses the raw `News` column, not `processedtext` — confirm this is intended.
x, y = data['News'], data['PriceSentiment']
# Stratified 70/30 split with a fixed seed; `x` and `y` are rebound to the training portion.
x, x_test, y, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)


Vectorize the news text into numeric token counts.

In [None]:
# Turn the text into bag-of-words token counts.
# The vocabulary is learned from the training split only and reused for the test split.
vec = CountVectorizer(stop_words='english')
# Keep the sparse matrices the vectorizer returns instead of calling `.toarray()`:
# a dense rows x vocabulary array is almost entirely zeros and can exhaust memory,
# while MultinomialNB's fit/predict/score accept sparse input directly.
x = vec.fit_transform(x)
x_test = vec.transform(x_test)

## Model Generation

After splitting the data and vectorizing the text into numbers, we will fit a multinomial Naive Bayes model on the training set and perform prediction on the test set features.

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the training features and labels;
# `fit` returns the estimator itself, which is then displayed as the cell output.
model = MultinomialNB().fit(x, y)
model

## Evaluating Model

After model generation, check the accuracy using actual and predicted values.

In [None]:
# Accuracy on the held-out test split, expressed as a percentage.
100 * model.score(x_test, y_test)


Then check the model's predictions on individual headlines…

In [None]:
from itertools import count
import pandas as pd

# Spot-check the fitted model on a fixed slice of a second CSV file.
# The file is read without a header row, so columns are addressed by position.
df = pd.read_csv('gold-dataset-sinha-khandait.csv', sep=',', header=None)
start = 10000
end = 10570
df = df[start - 1:end - 1]

# Column 2 is the text passed to the model; column 9 is the value compared with
# the prediction (presumably the price-sentiment label — confirm against the CSV).
texts = df.iloc[:, 2].tolist()
labels = df.iloc[:, 9].tolist()

# Vectorize and predict once for the whole slice instead of three times per row.
# The guard avoids calling the model on zero samples.
predictions = model.predict(vec.transform(texts)) if texts else []

correct = 0
for text, label, predicted in zip(texts, labels, predictions):
    is_match = label == predicted
    print(text)
    print(predicted, is_match)
    if is_match:
        correct += 1

# Percentage of rows whose prediction equals the value in column 9.
# An empty slice (e.g. a file shorter than `start`) would otherwise divide by zero.
if len(df):
    print(correct / len(df) * 100)
else:
    print('No rows in the selected evaluation range.')

In [None]:
# Predict the class for a single hand-written headline.
model.predict(vec.transform(['Changes in non-farm payrolls increase.']))

Average hourly earnings, m/m, remain unchanged. รายได้เฉลี่ยต่อชั่วโมง m/m ยังคงไม่เปลี่ยนแปลง
The change in non-farm payrolls increased from the previous time. การเปลี่ยนแปลงในการจ้างงานนอกภาคเกษตรเพิ่มขึ้นจากครั้งก่อน
lower unemployment rate อัตราการว่างงานลดลง


In [None]:
import joblib
# Persist the fitted classifier to `model.pkl` in the working directory.
# NOTE(review): only `model` is saved here; the fitted vectorizer `vec` that produces its
# input features is not written by this cell — confirm it is persisted elsewhere if the
# model is to be reused outside this notebook.
joblib.dump(model, 'model.pkl')

In [None]:
from localStoragePy import localStoragePy

# NOTE(review): `Element`, `localStorage` and `pyscript` are not defined or imported anywhere
# in the visible notebook, and the imported `localStoragePy` name is not used below —
# presumably this handler is meant to run inside a PyScript page; confirm.
def handle_click(e): 
    """Store the "comment" element's value and write to "output".

    Reads `Element("comment").value`, passes it to
    `localStorage.setItem("comment", ...)`, then calls
    `pyscript.write("output", "output")`. The `e` argument is accepted but
    not used in the body.
    """
    text =  Element("comment").value
    localStorage.setItem("comment", text)
    pyscript.write("output", "output")