In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
df_train = pd.read_csv('train.csv', encoding='cp1252')
df_test  = pd.read_csv('test.csv', encoding='cp1252')
df_train.head()

Unnamed: 0,Id,Hotel_name,Review_Title,Review_Text,Rating
0,0,Park Hyatt,Refuge in Chennai,Excellent room and exercise facility. All arou...,80.0
1,1,Hilton Chennai,Hilton Chennai,Very comfortable and felt safe. \r\nStaff were...,100.0
2,2,The Royal Regency,No worth the rating shown in websites. Pricing...,Not worth the rating shown. Service is not goo...,71.0
3,3,Rivera,Good stay,"First of all nice & courteous staff, only one ...",86.0
4,4,Park Hyatt,Needs improvement,Overall ambience of the hotel is very good. In...,86.0


In [3]:
df_train.shape

(2351, 5)

In [4]:
df_test.shape

(2352, 4)

Запишем положительные и отрицательные слова в 2 списка

In [5]:
pwords = np.array(open('positive-words.txt', encoding='cp1252').read().split())
nwords = np.array(open('negative-words.txt', encoding='cp1252').read().split())
pwords[:20]

array(['a+', 'abound', 'abounds', 'abundance', 'abundant', 'accessable',
       'accessible', 'acclaim', 'acclaimed', 'acclamation', 'accolade',
       'accolades', 'accommodative', 'accomodative', 'accomplish',
       'accomplished', 'accomplishment', 'accomplishments', 'accurate',
       'accurately'], dtype='<U20')

Создадим функцию подсчета среднего числа колонки по названию отеля

In [6]:
def count_mean_rating(data, coln):
    ratings = data[coln]
    d  = dict()
    di = dict()
    for i in range(len(ratings)):
        cur = data['Hotel_name'][i]
        if cur in d.keys():
            d[cur]  += ratings[i]
            di[cur] += 1
        else:
            d[cur]  = ratings[i]
            di[cur] = 1
    for k in d.keys():
        d[k] /= di[k]
    ans = np.ones(len(ratings))
    for i in range(len(ratings)):
        ans[i] = d[data['Hotel_name'][i]]
    return ans

mean_rating = count_mean_rating(df_train, 'Rating')

In [7]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
X = pd.DataFrame({'mr': mean_rating})
y = pd.DataFrame({'Rating': df_train['Rating']})
#X = pd.get_dummies(X)
#print(int(-cross_val_score(LinearRegression(), X, y, cv=5, scoring='neg_mean_squared_error').mean()))

Запишем для каждого слова из pwords и nwords вес из SentiWordNet.txt

In [8]:
weights = dict()
f = True
with open('SentiWordNet.txt') as swn:
    lns = swn.readlines()
    for i in range(1, len(lns)):
        cur = lns[i].split()
        if len(cur) > 1 and cur[0] in pwords:
            weights[cur[0]] = float(cur[1])
        elif len(cur) > 2 and cur[0] in nwords:
            weights[cur[0]] = -float(cur[2])
        if (len(lns) / i > 0.1 and f):
            print(cur[1])
            f = False

0.125


In [9]:
weights['awesome']

0.875

Подсчитаем TF-IDF для заголовков и текстов

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

df_train['Review_Title'] = df_train['Review_Title'].fillna('')
df_train['Review_Text']  = df_train['Review_Text'].fillna('')

corp_title = np.array(df_train['Review_Title'])
corp_text  = np.array(df_train['Review_Text'])

vectorizer_title = TfidfVectorizer()
vectorizer_text  = TfidfVectorizer()
Xcorp_title = vectorizer_title.fit_transform(corp_title)
Xcorp_text  = vectorizer_text.fit_transform(corp_text)

def count_ws(corp, vect):
    d = dict()
    for i in range(corp.shape[1]):
        d[vect.get_feature_names()[i]] = i
    return d

ws_title_vect = count_ws(Xcorp_title, vectorizer_title)
ws_text_vect = count_ws(Xcorp_text, vectorizer_text)

Для заголовка и текста каждого отзыва подсчитаем суммарный вес из TF-IDF и SentiWordNet. Если перед словом из pwords или nwords стоит слово very, more, extremely и т.д. мы услиливаем его вес. Если же перед словом стоит отрицательная частица (not, no, never и тд), то мы перед весом ставим знак минус и прибавляем/вычитаем максимальный вес (если просто поставить знак минус то получиться, что not good лучше чем not awesome, что на самом деле не так)

In [16]:
import nltk

nltk.download('punkt')

def makep(data):
    ws_title = np.zeros(len(data['Id']))
    ws_text  = np.zeros(len(data['Id']))
    neg = []
    neg = ['no', 'not', 'never', 'without', 'nowhere', 'neither', 'nor', 'isn\'t', 'didn\'t']
    mor = ['very', 'extremely', 'really', 'more', 'absolute', 'absolutely']
    for i in range(len(ws_text)):
        prev = ''
        if not pd.isna(data['Review_Title'][i]):
            for word in nltk.word_tokenize(data['Review_Title'][i]):
                if word.lower() in pwords or word.lower() in nwords:
                    if prev in neg:
                        if word.lower() in nwords:
                            ws_title[i] += Xcorp_title.max()
                            pass
                        else:
                            ws_title[i] -= Xcorp_title.max()
                            pass
                    if word.lower() in weights.keys():
                        ws_title[i] += weights[word.lower()]
                        if prev in mor:
                            ws_title[i] += weights[word.lower()]
                    if word in ws_title_vect.keys():
                        if word.lower() in nwords:
                            ws_title[i] -= Xcorp_title[i, ws_title_vect[word]]
                            if prev in mor:
                                ws_title[i] -= Xcorp_title[i, ws_title_vect[word]]
                        else:
                            ws_title[i] += Xcorp_title[i, ws_title_vect[word]]
                            if prev in mor:
                                ws_title[i] += Xcorp_title[i, ws_title_vect[word]]
                prev = word.lower()
            
        prev = ''
        if pd.isna(data['Review_Text'][i]): continue
        for word in nltk.word_tokenize(data['Review_Text'][i]):
            if word.lower() in pwords or word.lower() in nwords:
                if prev in neg:
                    if word.lower() in nwords:
                        ws_text[i] += Xcorp_text.max()
                        pass
                    else:
                        ws_text[i] -= Xcorp_text.max() 
                        pass
                if word.lower() in weights.keys():
                    ws_text[i] += weights[word.lower()]
                    if prev in mor:
                            ws_text[i] += weights[word.lower()]
                if word in ws_text_vect.keys():
                    if word.lower() in nwords:
                        ws_text[i] -= Xcorp_text[i, ws_text_vect[word]]
                        if prev in mor:
                            ws_text[i] -= Xcorp_text[i, ws_text_vect[word]]
                    else:
                        ws_text[i] += Xcorp_text[i, ws_text_vect[word]]
                        if prev in mor:
                            ws_text[i] += Xcorp_text[i, ws_text_vect[word]]
            prev = word.lower()
    return ws_title, ws_text

ws_title, ws_text = makep(df_train)

[nltk_data] Downloading package punkt to /home/watemus/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
ws_text[:30]

array([ 2.81657316,  2.36439832, -1.33335188,  0.38656196,  0.67826538,
        0.95989831,  3.0749068 , -0.36112393,  0.22700973, -0.44470738,
        3.0677214 ,  2.15981046, -1.04369424, -0.18532364, -1.74433313,
        0.45014109,  2.07022766, -0.45031318,  0.33854771, -0.52373493,
       -0.69038051,  4.77160916,  1.19152315,  1.57317388,  0.        ,
        1.86189628,  1.12617914,  0.9944533 ,  0.47320976,  0.7555258 ])

Добавим еще пару признаков (сумму суммарных весов текста и заголовка, средние разные суммарные веса по заголовку) и сделаем кросс-валидацию на RandomForestRegressor

In [18]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.utils.validation import column_or_1d
X['ws_text']  = ws_text
X['ws_title'] = ws_title
X['sws'] = X['ws_text'] + X['ws_title']
X['mr'] = mean_rating
X['Hotel_name'] = df_train['Hotel_name']
X['mws_text'] = count_mean_rating(X, 'ws_text')
X['mws_title'] = count_mean_rating(X, 'ws_title')
X['msws'] = count_mean_rating(X, 'sws')
X['smws'] = X['mws_title'] + X['mws_text']
del X['Hotel_name']
y = np.array(y)
y = column_or_1d(y, warn=True)
#print((X['mws_title']<-9.1).sum())
print(int(-cross_val_score(RandomForestRegressor(n_estimators=200, max_depth=50), X, y, cv=5, scoring='neg_mean_squared_error', n_jobs=5).mean()))
X.head()

168


Unnamed: 0,mr,ws_text,ws_title,sws,mws_text,mws_title,msws,smws
0,79.924528,2.816573,0.0,2.816573,1.295506,0.346771,1.642277,1.642277
1,90.527778,2.364398,0.0,2.364398,1.805128,0.544207,2.349336,2.349336
2,66.555556,-1.333352,-0.735928,-2.06928,0.297394,0.085757,0.383151,0.383151
3,84.083333,0.386562,0.0,0.386562,1.736056,0.157511,1.893568,1.893568
4,79.924528,0.678265,1.085523,1.763788,1.295506,0.346771,1.642277,1.642277
