In [22]:
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk import word_tokenize, sent_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier

In [23]:
import re
# Ordered (regex, replacement) pairs used to expand English contractions.
# Order matters: the irregular forms (won't, can't, i'm, ain't) are listed
# before the generic suffix rules so that they are rewritten first.
# Replacement templates are raw strings so that the \g<1> group reference is
# not parsed as an (invalid) string escape sequence.
# Known limitation: 's is always expanded to "is" and 'd to "would", so
# possessives ("john's") and "had" contractions are expanded the same way.
replacement_patterns = [
    (r'won\'t', 'will not'),
    (r'can\'t', 'cannot'),
    (r'i\'m', 'i am'),
    (r'ain\'t', 'is not'),
    (r'(\w+)\'ll', r'\g<1> will'),
    (r'(\w+)n\'t', r'\g<1> not'),
    (r'(\w+)\'ve', r'\g<1> have'),
    (r'(\w+)\'s', r'\g<1> is'),
    (r'(\w+)\'re', r'\g<1> are'),
    (r'(\w+)\'d', r'\g<1> would'),
]

class RegexpReplacer(object):
    """Applies an ordered sequence of regex substitutions to a string."""

    def __init__(self, patterns=replacement_patterns):
        """Compile each (regex, replacement) pair once, keeping the given order.

        patterns: iterable of (regex_string, replacement_template) tuples.
        """
        compiled_pairs = []
        for regex, repl in patterns:
            compiled_pairs.append((re.compile(regex), repl))
        self.patterns = compiled_pairs

    def replace(self, text):
        """Return `text` after running every stored substitution in sequence.

        Each rule operates on the output of the previous one.
        """
        result = text
        for compiled_regex, replacement in self.patterns:
            result = compiled_regex.sub(replacement, result)
        return result

In [31]:
# ASCII punctuation characters; tokenize_text drops every character found here
# before tokenizing. Must be defined before tokenize_text is called.
punc_list = string.punctuation

In [24]:
# preprocessing
def tokenize_text(list_of_strings):
    """Turn raw text documents into lists of lemmatized tokens.

    For every input string the text is lowercased, contractions are expanded
    with RegexpReplacer, every character present in `punc_list` is removed,
    and the remainder is word-tokenized, POS-tagged and lemmatized using the
    WordNet part of speech derived from each tag.

    Parameters
    ----------
    list_of_strings : iterable of str
        Documents to process.

    Returns
    -------
    list of list of str
        One list of lemmas per input document, in input order.
    """
    # Loop-invariant helpers are built once rather than once per document/token.
    replacer = RegexpReplacer()
    lemmatizer = WordNetLemmatizer()
    punctuation = set(punc_list)

    list_of_lists = []
    for text in list_of_strings:
        expanded = replacer.replace(text.lower())
        # Contractions are expanded first because stripping punctuation
        # removes the apostrophes the replacement patterns rely on.
        stripped = ''.join(ch for ch in expanded if ch not in punctuation)
        tagged = pos_tag(word_tokenize(stripped))
        list_of_lists.append(
            [lemmatizer.lemmatize(token, get_wordnet_pos(tag)) for token, tag in tagged]
        )
    return list_of_lists

def get_wordnet_pos(treebank_tag):
    """Translate a treebank-style POS tag into the WordNet POS constant the lemmatizer accepts.

    Tags starting with J, V, N or R map to adjective, verb, noun and adverb
    respectively; every other tag falls back to noun.
    """
    from nltk.corpus import wordnet

    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

In [25]:
# feature extraction and vectorization
def build_feature_matrices(list_of_lists, max_features=1000):
    """Fit a bag-of-words vocabulary on tokenized documents and count tokens.

    Parameters
    ----------
    list_of_lists : iterable of list of str
        Already-tokenized documents.
    max_features : int, optional
        Upper bound on vocabulary size passed to CountVectorizer
        (default 1000).

    Returns
    -------
    X_transform : numpy.ndarray
        Dense document-term count matrix.
    features_voc : list
        Vocabulary terms, in the column order of `X_transform`.
    """
    # Documents are pre-tokenized, so the analyzer passes each token list through as-is.
    vectorizer = CountVectorizer(analyzer=lambda x: x, max_features=max_features, lowercase=False)
    X_transform = vectorizer.fit_transform(list_of_lists).toarray()
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when present and keep returning a plain list either way.
    if hasattr(vectorizer, 'get_feature_names_out'):
        features_voc = list(vectorizer.get_feature_names_out())
    else:
        features_voc = vectorizer.get_feature_names()
    return X_transform, features_voc

def build_feature_matrices_test(list_of_lists, vocabulary=None):
    """Count tokens of tokenized documents against an existing vocabulary.

    Parameters
    ----------
    list_of_lists : iterable of list of str
        Already-tokenized documents.
    vocabulary : iterable of str, optional
        Vocabulary to vectorize against. When omitted, the notebook-level
        `features_voc` is used, which must already have been defined.

    Returns
    -------
    numpy.ndarray
        Dense document-term count matrix with one column per vocabulary term.
    """
    if vocabulary is None:
        # Backward-compatible default: the vocabulary stored as a global.
        vocabulary = features_voc
    vectorizer = CountVectorizer(analyzer=lambda x: x, vocabulary=vocabulary, lowercase=False)
    # The vocabulary is fixed, so there is nothing to fit on this data.
    X_transform = vectorizer.transform(list_of_lists).toarray()
    return X_transform

## TODO: replace with LightGBM

In [26]:
def build_clf(X, Y, random_state=None):
    """Fit a random forest classifier and print its accuracy on the training data.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    Y : array-like
        Target labels aligned with the rows of `X`.
    random_state : int or None, optional
        Seed forwarded to RandomForestClassifier. The default (None) keeps the
        original unseeded behaviour; pass an int for reproducible fits.

    Returns
    -------
    RandomForestClassifier
        The fitted classifier.
    """
    clf = RandomForestClassifier(random_state=random_state)
    clf.fit(X, Y)
    # The score below is computed on the same data the model was fitted on,
    # so it is a training accuracy, not a generalisation estimate.
    print("Accuracy: ",clf.score(X, Y))
    return clf

def predict_clf(X, Y, model=None):
    """Predict labels for `X`, print the accuracy against `Y`, and return the predictions.

    Parameters
    ----------
    X : array-like
        Feature matrix to predict on.
    Y : array-like
        True labels used for the printed accuracy.
    model : object with `predict` and `score` methods, optional
        Classifier to use. When omitted, the notebook-level `clf` is used,
        which must already have been defined.

    Returns
    -------
    The value returned by `model.predict(X)`.
    """
    if model is None:
        # Backward-compatible default: the classifier stored as a global.
        model = clf
    predictions = model.predict(X)
    print("Accuracy: ",model.score(X, Y))
    return predictions
    

In [28]:
# Load the pipe-delimited reviews, skipping malformed rows.
# `error_bad_lines` was removed in pandas 2.0 in favour of `on_bad_lines`;
# try the current keyword first and fall back for older pandas versions,
# which reject the unknown keyword with a TypeError.
try:
    df = pd.read_table('reviews_clean.csv', header=0, on_bad_lines='skip', delimiter='|')
except TypeError:
    df = pd.read_table('reviews_clean.csv', header=0, error_bad_lines=False, delimiter='|')
# 50/50 split with a fixed seed so the partition is reproducible.
train, test = train_test_split(df, test_size = 0.5, random_state = 222)

    


In [32]:
# Training pass: tokenize and lemmatize the training reviews, then fit the
# bag-of-words vocabulary on them. `features_voc` is kept as a global because
# build_feature_matrices_test reads it by default.
review_word_lists = tokenize_text(train.text)
X_transform, features_voc = build_feature_matrices(review_word_lists)
# NOTE(review): classifier training is commented out, yet predict_clf reads a
# global `clf`; re-enable this line before evaluating on a fresh kernel.
#clf = build_clf(X_transform, train['label'])   

Accuracy:  0.984727966911


In [33]:
# Test
# Held-out pass: same preprocessing as the training split, vectorized against
# the vocabulary produced by the training cell.
review_word_lists_test = tokenize_text(test.text)
X_transform_test = build_feature_matrices_test(review_word_lists_test)
# NOTE(review): the evaluation call is disabled; it needs a fitted global `clf`.
#predict_clf(X_transform_test,test['label'])

Accuracy:  0.66446214064
