In [1]:
import numpy as np
import pandas as pd
import re
import string
import pickle

In [2]:
txt = 'great product. i like it'

In [3]:
def remove_punctuation(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
        return text

In [4]:
with open('../static/model/corpora/stopwords/english', 'r') as file:
    sw = file.read().splitlines()

In [5]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [6]:
def preprocessing(text):
    data = pd.DataFrame([text], columns=['tweet'])
    data["tweet"] = data["tweet"].apply(lambda x: " ".join(x.lower() for x in x.split()))
    data["tweet"] = data["tweet"].apply(lambda x: " ".join(re.sub(r'^https?:\/\/.*[\r\n]*', '', x, flags=re.MULTILINE) for x in x.split()))
    data["tweet"] = data["tweet"].apply(remove_punctuation)
    data["tweet"] = data["tweet"].str.replace('\d+', '', regex=True)
    data["tweet"] = data["tweet"].apply(lambda x: " ".join(x for x in x.split() if x not in sw))
    data["tweet"] = data["tweet"].apply(lambda x: " ".join(ps.stem(x) for x in x.split()))
    return data['tweet']


In [7]:
preprocessing_txt = preprocessing(txt)

In [8]:
preprocessing_txt

0    great product. like
Name: tweet, dtype: object

In [9]:
vocab = pd.read_csv('../static/model/vocabulary.tex', header=None)
tokens = vocab[0].tolist()

In [10]:
tokens


['test',
 'android',
 'app',
 'beauti',
 'cute',
 'health',
 'iger',
 'iphoneonli',
 'iphonesia',
 'iphon',
 'final',
 'case',
 'thank',
 'yay',
 'soni',
 'xperia',
 'love',
 'would',
 'go',
 'talk',
 'relax',
 'smartphon',
 'wifi',
 'connect',
 'im',
 'know',
 'made',
 'way',
 'home',
 'amaz',
 'servic',
 'appl',
 'wont',
 'even',
 'question',
 'pay',
 'stupid',
 'support',
 'softwar',
 'updat',
 'fuck',
 'phone',
 'big',
 'time',
 'happi',
 'us',
 'instap',
 'instadaili',
 'xperiaz',
 'new',
 'type',
 'c',
 'charger',
 'cabl',
 'uk',
 '…',
 'amazon',
 'year',
 'newyear',
 'start',
 'technolog',
 'samsunggalaxi',
 'iphonex',
 'shop',
 'listen',
 'music',
 'likeforlik',
 'photo',
 'fun',
 'selfi',
 'water',
 'camera',
 'picoftheday',
 'sun',
 'instagood',
 'boy',
 'outdoor',
 'hey',
 'make',
 'ipod',
 'dont',
 'color',
 'inch',
 'crash',
 'everi',
 'need',
 'realli',
 'drop',
 'ball',
 'design',
 'give',
 'anoth',
 'crazi',
 'purchas',
 'lol',
 'work',
 'hard',
 'play',
 'ipad',
 'batt

In [11]:
def vectorizer(ds, vocabulary):
    vectorized_list = []

    for sentance in ds:
        sentance_list = np.zeros(len(vocabulary))

        for i in range(len(vocabulary)):
            if vocabulary[i] in sentance.split():
                sentance_list[i] =1

        vectorized_list.append(sentance_list)

    vectorized_list_new = np.asarray(vectorized_list, dtype= np.float32)

    return vectorized_list_new

In [12]:
vectorizer_txt = vectorizer(preprocessing_txt, tokens)

In [13]:
vectorizer_txt

array([[0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [14]:
with open('../static/model/model.pickle', 'rb') as f:
    model = pickle.load(f)

In [15]:
model.predict(vectorizer_txt)

array([1])

In [16]:
def get_prediction(vectorizer_txt):
    prediction = model.predict(vectorizer_txt)
    if prediction == 1:
        return 'negative'
    else:
        return 'positive'

In [41]:
txt = "awsome product. i love it"
preprocessed_txt = preprocessing(txt)
vectorizer_txt = vectorizer(preprocessed_txt, tokens)
prediction = get_prediction(vectorizer_txt)
prediction

'positive'