In [2]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.metrics import mean_squared_error
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import stopwords

In [4]:
df = pd.read_csv('reviews.csv')

In [5]:
data = df[['Text', 'Score']]

In [6]:
data.head()

Unnamed: 0,Text,Score
0,I have bought several of the Vitality canned d...,5
1,Product arrived labeled as Jumbo Salted Peanut...,1
2,This is a confection that has been around a fe...,4
3,If you are looking for the secret ingredient i...,2
4,Great taffy at a great price. There was a wid...,5


In [7]:
conjunct = ['and', 'moreover', 'in addition', 'additionally', 'further', 'furthermore', 'along with', 'as well as', 'also',
           'but', 'however', 'in contrast', 'instead', 'on the other hand', 'whereas', 'except that', 'on the contrary', 
            'conversely', 'nevertheless', 'although', 'alternatively',
            'if', 'unless', 'even if', 'even until', 'as long as', 'supposing', 'in case', 'only if', 'when',
            'or', 'similarly', 'equally', 'likewise']

additive = ['and', 'moreover', 'in addition', 'additionally', 'further', 'furthermore', 'along with', 'as well as', 'also']
contrast = ['but', 'however', 'in contrast', 'instead', 'on the other hand', 'whereas', 'except that', 'on the contrary', 
            'conversely', 'nevertheless', 'although', 'alternatively']
conditional = ['if', 'unless', 'even if', 'even until', 'as long as', 'supposing', 'in case', 'only if', 'when']
similarity = ['or', 'similarly', 'equally', 'likewise']

In [8]:
data.describe()

Unnamed: 0,Score
count,568454.0
mean,4.183199
std,1.310436
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


In [9]:
one_sentence = data['Text'].apply(lambda x: x.count('.') == 1 if type(x) == str else 0)
yes_conjunct = data['Text'].apply(lambda x: any([' '+ c + ' ' in x for c in conjunct]) if type(x) == str else False)
df1 = data.loc[yes_conjunct == True].loc[one_sentence == True]
df1['Score'].value_counts()

5    23286
4     3391
1     1841
3     1586
2      989
Name: Score, dtype: int64

In [10]:
sum(one_sentence)

38447

In [11]:
len(df1)

31093

In [12]:
conjunction = df1['Text'].apply(lambda x: [c for c in conjunct if c in x])
all_conjuncts = sum(conjunction.tolist(), [])
vocab = list(set(all_conjuncts))
print(vocab)

['additionally', 'although', 'further', 'in addition', 'on the other hand', 'along with', 'instead', 'unless', 'as well as', 'when', 'as long as', 'or', 'also', 'nevertheless', 'in case', 'only if', 'but', 'if', 'even if', 'however', 'whereas', 'likewise', 'except that', 'equally', 'and', 'furthermore']


In [13]:
conjunct_dict = {c: all_conjuncts.count(c) for c in vocab}
sorted(conjunct_dict.items(), key = lambda x:x[1], reverse = True)
conjunct_type_count = {[additive, contrast, conditional, similarity].index(x): sum([conjunct_dict[w] for w in x if w in list(conjunct_dict.keys())]) for x in [additive, contrast, conditional, similarity]}
conjuncts_by_type = {x[0]: [w for w in x if w in list(conjunct_dict.keys())] for x in [additive, contrast, conditional, similarity]}
conjuncts_by_type

{'and': ['and',
  'in addition',
  'additionally',
  'further',
  'furthermore',
  'along with',
  'as well as',
  'also'],
 'but': ['but',
  'however',
  'instead',
  'on the other hand',
  'whereas',
  'except that',
  'nevertheless',
  'although'],
 'if': ['if', 'unless', 'even if', 'as long as', 'in case', 'only if', 'when'],
 'or': ['or', 'equally', 'likewise']}

In [17]:
# replace all conjunct with but and if or
def replace(x, conjunct = conjuncts_by_type):
    for key in conjuncts_by_type.keys():
        for val in conjuncts_by_type[key]:
            x = x.replace(val, key)
    return x

# data cleaning
REPLACE_NO_SPACE = re.compile("(\.)|(\\\)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\r)|(\n)")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
stop_words = set(stopwords.words('english'))

def clean(review):
    review = REPLACE_NO_SPACE.sub("", review.lower())
    review = REPLACE_WITH_SPACE.sub(" ", review)
    return review

In [18]:
order = ['but', 'if', 'and', 'or']
df1['review'] = df1['Text'].apply(lambda x: replace(x))
df1['review'] = df1['review'].apply(lambda x: clean(x))
df1['conj'] = df1['review'].apply(lambda x: [c for c in x.split(' ') if c in order])
train = [df1.loc[s] for s in df1.index if len(df1['conj'].loc[s]) == 1]
train_dict = {}
for conj in order:
    val = [x['review'].split(' ' + conj + ' ') + [x['Score']] for x in train if ' ' + conj + ' ' in x['review']]
    train_dict[conj] = val
len(train)

13373

In [27]:
def p(x):
    if list(swn.senti_synsets(x)) == []:
        return 0
    pos = [w.pos_score() for w in swn.senti_synsets(x) if w.pos_score() > 0]
    neg = [w.neg_score() for w in swn.senti_synsets(x) if w.neg_score() > 0]
    if pos == []:
        pos = 0
    else:
        pos = np.mean(pos)
    
    if neg == []:
        neg = 0
    else:
        neg = np.mean(neg)
    return 3*(pos - neg)


train_matrix = {}
for key in order:
    data = train_dict[key]
    matrix = []
    for s in data:
        matrix.append([sum([p(x) for x in s[0]]), sum([p(x) for x in s[1]]), (s[2]-3)/2])
    train_matrix[key] = np.matrix(matrix)

In [28]:
weights = {}
for conj in order:
    X = train_matrix[conj][:, 0:2]
    y = train_matrix[conj][:, 2]
    linear = LinearRegression()
    linear.fit(X, y)
    weights[conj] = linear.coef_[0,:].tolist() + linear.intercept_.tolist()

In [29]:
X1 = np.concatenate((train_matrix['but'], train_matrix['and'], train_matrix['or'], train_matrix['if']), axis = 0)
X1

matrix([[ 1.875,  7.5  ,  0.5  ],
        [-2.625,  0.75 , -0.5  ],
        [-5.25 ,  1.5  ,  1.   ],
        ...,
        [ 0.   , -2.25 ,  1.   ],
        [-0.375,  1.125, -1.   ],
        [ 1.5  , -1.125,  1.   ]])

In [30]:
linear = LinearRegression()
linear.fit(X1[:100, 0] + X1[:100, 1], X1[:100, 2])
y1 = linear.predict(X1[:, 0] + X1[:, 1])
np.sqrt(mean_squared_error(X1[:, 2]*2+3, y1*2+3))

1.1982319908717043

In [31]:
weights

{'but': [0.01463208486813129, 0.0027068563790658023, 0.45247339141720627],
 'if': [0.00863337034292694, 0.002974153231389477, 0.6999628374604729],
 'and': [0.007006055384766385, 0.006159608305781145, 0.8092985892679921],
 'or': [0.006660713821057349, 0.004962421336278101, 0.7750780569119118]}

In [32]:
def logic_tree(s, order):
    conj = [x for x in s if x in order]
    if conj == []:
        return s
    node = {}
    node['value'] = order[min([order.index(x) for x in conj])]
    node['left'] = logic_tree(s[:s.index(node['value'])], order)
    node['right'] = logic_tree(s[s.index(node['value'])+1:], order)
    return node

def score(t, weights):
    if type(t) != dict:
        return sum([1 for x in t])
    conj =  t['value']
    return (weights[conj][0] * score(t['left'], weights) 
            + weights[conj][1] * score(t['right'], weights)
            + weights[conj][2])


In [35]:
df1['logic_tree'] = df1['review'].apply(lambda x: logic_tree(x.split(' '), order))
df1['fitted'] = df1['logic_tree'].apply(lambda x: round(score(x, weights)*2 + 2.7, 1) 
                                        if (score(x, weights)*2 + 2.7 < 5) else 5)
np.sqrt(mean_squared_error(df1['Score'], df1['fitted']))

1.1245011973246404

In [36]:
tf = TfidfVectorizer()
tf.fit(df1['review'])
all_X = tf.transform(df1['review'])

In [38]:
lr = LogisticRegression(multi_class = 'multinomial', solver = 'newton-cg')
lr.fit(all_X[:160, :], df1['Score'].tolist()[:160])
y = lr.predict(all_X[160:])
np.sqrt(mean_squared_error(df1['Score'].tolist()[160:], y))

1.2436370232566718

In [40]:
lr = LogisticRegression(multi_class = 'multinomial', solver = 'newton-cg')
lr.fit(all_X[:600, :], df1['Score'].tolist()[:600])
y = lr.predict(all_X)
np.sqrt(mean_squared_error(df1['Score'].tolist(), y))

1.2422208329000117