In [195]:
#importing libraries
import pandas as pd
from textblob import TextBlob
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import collections
from nltk.stem import WordNetLemmatizer
import itertools
from collections import Counter

### Loading comments and model-brands data


In [103]:
data = pd.read_csv('comments_raw.csv')[:5000]
data.head()

Unnamed: 0,page,comment_id,user_id,date,comment
0,1,3504465,410384,"April 11, 2007 6:52PM",Hi Pat:You forgot the Chrysler Sebring
1,1,3515400,209396,"April 11, 2007 7:33PM",I'm sure some folks would appreciate having th...
2,1,3516719,457562,"April 12, 2007 6:51AM",You can try to revive this topic but without b...
3,1,3517791,410027,"April 12, 2007 8:43AM",Model vs. model is exactly what we're here for...
4,1,3518875,411850,"April 13, 2007 11:49AM",The Altima is my favorite of the bunch. It is ...


In [104]:
model = pd.read_csv('models.csv')
model.head()

Unnamed: 0,acura,integra
0,acura,Legend
1,acura,vigor
2,acura,rlx
3,acura,ILX
4,acura,MDX


In [105]:
model_dict = dict(zip(model.integra, model.acura))

In [106]:
model_dict['integra']='acura'

In [107]:
model_dict['cars']

'car'

#### Removing non-brand keys from the model-brand dictionary

In [108]:
del model_dict["cars"]
del model_dict["seats"]
del model_dict["problems"]
del model_dict["sedans"]

#### Replacing models with brands

In [109]:
def replace_all(text):
    for i, j in model_dict.items():
        text = text.replace(j.lower(), i.lower())
    return text

In [110]:
data['com_replaced'] = data['comment'].apply(replace_all)

In [111]:
models_input = list(set(list(model_dict.values())))
models_input

['audi',
 'toyata',
 'kia',
 'ford',
 'dodge',
 'volvo',
 'pontiac',
 'acura',
 'subaru',
 'cadillac',
 'hyndai kia',
 'buick',
 'lincoln',
 'chrysler',
 'mitsubishi',
 'honda',
 'mazda',
 'volkswagen',
 'hyundai',
 'infiniti',
 'mercedes',
 'chevrolet',
 'saturn',
 'mercury',
 'bmw',
 'suzuki',
 'toyota',
 'nissan']

In [112]:
brand_list = models_input
#brand_list.remove("problem")
#brand_list.remove("car")
#brand_list.remove("seat")
#brand_list.remove("sedan")

In [113]:
wl = WordNetLemmatizer()

### Data cleaning and pre-processing

#### Getting part-of-speech tags for comments

In [114]:
pos_dict = {'J':wordnet.ADJ, 'V':wordnet.VERB, 'N':wordnet.NOUN, 'R':wordnet.ADV}
def token_stop_pos(text):
    tags = pos_tag(word_tokenize(text))
    newlist = []
    for word, tag in tags:
        if word.lower() not in set(stopwords.words('english')):
            newlist.append(tuple([word.lower(), pos_dict.get(tag[0])]))
    return newlist

data['pos'] = data['comment'].apply(token_stop_pos)

#### lemmatizing comments

In [115]:
def lemmatize(pos_data):
    lemma_rew = " "
    for word, pos in pos_data:
        if not pos:
            lemma = word
            lemma_rew = lemma_rew + " " + lemma
        else:
            lemma = wl.lemmatize(word, pos=pos)
            lemma_rew = lemma_rew + " " + lemma
    return lemma_rew

data['Lemma'] = data['pos'].apply(lemmatize)

In [116]:
data.head()

Unnamed: 0,page,comment_id,user_id,date,comment,com_replaced,pos,Lemma
0,1,3504465,410384,"April 11, 2007 6:52PM",Hi Pat:You forgot the Chrysler Sebring,Hi Pat:You forgot the Chrysler Sebring,"[(hi, n), (pat, n), (:, None), (forgot, v), (c...",hi pat : forget chrysler sebring
1,1,3515400,209396,"April 11, 2007 7:33PM",I'm sure some folks would appreciate having th...,I'm sure some folks would appreciate having th...,"[('m, v), (sure, a), (folks, n), (would, None)...",'m sure folk would appreciate malibu include...
2,1,3516719,457562,"April 12, 2007 6:51AM",You can try to revive this topic but without b...,You can try to revive this topic but without b...,"[(try, v), (revive, v), (topic, n), (without, ...",try revive topic without able discuss ( howe...
3,1,3517791,410027,"April 12, 2007 8:43AM",Model vs. model is exactly what we're here for...,Model vs. model is exactly what we're here for...,"[(model, n), (vs., None), (model, n), (exactly...",model vs. model exactly 're ! manufacturer v...
4,1,3518875,411850,"April 13, 2007 11:49AM",The Altima is my favorite of the bunch. It is ...,The Altima is my favorite of the bunch. It is ...,"[(altima, n), (favorite, n), (bunch, n), (., N...",altima favorite bunch . amongst fast best ha...


#### performing sentiment analysis on comments

This is to understand the context of the aspirational words used with reference to brands.
TextBlob functions gives output in the form a tuple with two values:
- Positivity (range: -1 to 1): Here, 1 and -1 represent extreme positive and negative ends respectively.
- Subjectivity (range: 0 to 1): Here, 0 represents the statement in general context and one represents the statement in subjective context.

In [117]:
def datablob(lemma):
    return TextBlob(lemma).sentiment

data['Blob'] = data['Lemma'].apply(datablob)

#### Defining a set of aspirational words to look out for in comments

In [118]:
aspiration = ['premium', 'luxury', 'lux', 'grace', 'style', 'buy', 'wishlist', 'wish', 'own', 'dream', 'expensive', 
              'class', 'smooth', 'pricey', 'elite', 'favorite','appreciate', 'brand', 'have']

#### Finding aspirational words from above defined list in the comments

In [119]:
def aspire(Lemma):
    tokens = word_tokenize(Lemma)
    aspire_words=[]
    for item in tokens: 
        if item.lower() in (string.lower() for string in aspiration) and item.lower() not in (string.lower() for string in aspire_words):
            aspire_words.append(item.lower())
    return aspire_words

data['Aspire'] = data['Lemma'].apply(aspire)

#### Only keeping rows which detected aspirational words

In [120]:
df = data.loc[data['Aspire'].str.len() >= 1]

In [121]:
pos_col = df['pos'].to_list()
#pos_col
lst_comment_words = []
for sentence in pos_col:
    new_sentence = []
    for t in sentence:
        new_sentence.append(t[0])
    lst_comment_words.append(new_sentence)

In [122]:
df['lst_of_words'] = lst_comment_words

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lst_of_words'] = lst_comment_words


#### Getting brand names from the comments

In [123]:
model_lookup = pd.read_csv('models.csv',header=None)
brands_np = model_lookup.iloc[:,0].unique()
model_np = model_lookup.iloc[:,1].unique()

def get_brand(sentence):
    result = []
    # lower case all strings
    sentence_lower = [x.lower() for x in sentence]
    # drop duplicates
    sentence_lower = list(dict.fromkeys(sentence_lower))
    # go through the list of car brands
    for brand in brands_np:
        if brand.lower() in sentence_lower:
            if brand.lower() not in result:
                result.append(brand.lower())
    # go through the model list, retraive brand info
    for i,model in enumerate(model_lookup.iloc[:,1]):
        candidate_brands = []
        for j,word in enumerate(sentence_lower):
            if model.lower() == word:
                candidate_brands.append(model_lookup.iloc[i,0])
        if len(candidate_brands) == 1 and candidate_brands[0].lower() not in result:
            result.append(candidate_brands[0].lower())
        elif len(candidate_brands) > 1:
            flg = 0
            for brand_c in candidate_brands:
                if brand_c in result:
                    flg += 1
            if flg == 0:
                result.append(candidate_brands[0].lower())
                    
    return result

In [124]:
brands_in_comments = list(map(lambda x: get_brand(x), lst_comment_words))

In [125]:
df['names'] = brands_in_comments

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['names'] = brands_in_comments


#### Dropping other columns

In [127]:
df1 = df.drop(['page', 'comment_id', 'comment', 'user_id', 'date'], axis=1)

#### Filtering data based on positivity and subjectivity
- Positivity: Keeping data with positive values above 0 so that the brands are mentioned in positive context wrt aspirational words.
- Subjectivity: Keeping data with subjective score>=0.5 so that we can assume its the subjective opinion of user regarding the brand which would suggest the user's preference towards the brand.

In [128]:
df2 = df1.loc[(df1['Blob'].str[0] >= 0) & (df1['Blob'].str[1] >= 0.5)]
df2.drop(['pos', 'Blob'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [129]:
df2.head()

Unnamed: 0,com_replaced,Lemma,Aspire,lst_of_words,names
1,I'm sure some folks would appreciate having th...,'m sure folk would appreciate malibu include...,[appreciate],"['m, sure, folks, would, appreciate, malibu, i...",[chevrolet]
4,The Altima is my favorite of the bunch. It is ...,altima favorite bunch . amongst fast best ha...,"[favorite, expensive]","[altima, favorite, bunch, ., amongst, fastest,...","[car, ford, hyundai, kia, mazda, nissan, honda..."
9,"My daily driver is an '03 Maxima, and the '07 ...","daily driver '03 maximum , '07 altima feel f...",[pricey],"[daily, driver, '03, maxima, ,, '07, altima, f...","[car, ford, mazda, nissan]"
11,Its interesting how that happens. There is no ...,interesting happen . real successor contour ...,[pricey],"[interesting, happens, ., real, successor, con...","[car, ford, mazda, nissan]"
24,I have been driving Accords for 16 years now. ...,"drive accord 16 year . 12 year first one , 4...","[smooth, buy, brand]","[driving, accords, 16, years, ., 12, years, fi...","[car, sedan, honda]"


### Calculations and frequencies

#### Calculating frequency of different aspirational words from the dataset

In [130]:
data_asp = data['Aspire'].to_list()
asp_count = [0]*len(aspiration)
for i,w in enumerate(aspiration):
    for sentence in data_asp:
        if w.lower() in sentence:
            asp_count[i] += 1

In [141]:
asp_count_pd = pd.DataFrame({'aspiration':aspiration,'count':asp_count})

#### Calculating frequency of combination of brands and aspirational words from the dataset

In [134]:
combinations = []
for w1 in brand_list:
    for w2 in aspiration:
        temp = (w1,w2)
        combinations.append(temp)

In [181]:
df1.shape

(1572, 7)

In [178]:
combination_freq = [0]*len(combinations)

lst_asp = df1.Aspire.to_list()
lst_brands = df1.names.to_list()
for i,c in enumerate(combinations):
    for j, asp in enumerate(lst_asp):
        if c[0].lower() in lst_brands[j] and c[1].lower() in asp:
            combination_freq[i] += 1

In [185]:
combo_count = pd.DataFrame({'combo':combinations,'count':combination_freq})#.to_csv('combo_count.csv')

In [186]:
combo_count

Unnamed: 0,combo,count
0,"(audi, premium)",2
1,"(audi, luxury)",2
2,"(audi, lux)",0
3,"(audi, grace)",0
4,"(audi, style)",2
...,...,...
527,"(nissan, elite)",0
528,"(nissan, favorite)",7
529,"(nissan, appreciate)",4
530,"(nissan, brand)",21


#### Preparing data for lift calculation

#### Converting brand count to dictionary

In [62]:
ctr_brands = pd.read_csv('brand_count.csv')
print(ctr_brands)

         brand  count
0          car   2331
1        honda   2061
2         ford   1339
3       toyota    937
4      hyundai    576
5        mazda    552
6       nissan    514
7        sedan    408
8      problem    343
9    chevrolet    235
10      saturn    231
11    chrysler    211
12        seat    194
13      subaru    153
14         kia    132
15         bmw    108
16  volkswagen    100
17     mercury     81
18       dodge     72
19       acura     61
20       buick     57
21     pontiac     56
22  mitsubishi     46
23     lincoln     45
24        audi     41
25       volvo     30
26    mercedes     26
27    cadillac     25
28    infiniti     10
29      suzuki      8


In [143]:
ctr_brands_car = ctr_brands.loc[~ctr_brands.brand.isin(['car','problem','sedan','seat'])]

In [144]:
N = len(data)

#### Calculating lift values for each (brand, aspirational word) pair

In [189]:
recorded_combo = []
lift=[]
ctr_brands['brand']

for w1,w2 in combinations:
    #print(w1,w2)
    if w1 in ctr_brands_car['brand'].to_list() and w2 in asp_count_pd['aspiration'].to_list():
        #print('here')
        pw1 = ctr_brands_car.loc[ctr_brands_car.brand==w1]['count'].values[0]
        pw2 = asp_count_pd.loc[asp_count_pd.aspiration==w2]['count'].values[0]
        combo_info = combo_count.loc[combo_count.combo == (w1, w2)]['count'].values[0]
        if pw1>0 and pw2>0:
            recorded_combo.append((w1,w2))
            lift.append(N*(combo_info/(pw1*pw2)))


In [190]:
combo_lift = pd.DataFrame({'recorded_combo':recorded_combo,'lift':lift})

In [192]:
combo_lift.to_csv('part_e_lift.csv')

## Sentiment Analysis
Recall that we performed sentiment analysis on comments. Count the most mentioned brands under comments that identified as positive sentiment.

In [194]:
df2.shape

(619, 5)

we have around 619 comments that identified as postive.

In [201]:
pos_brands = list(itertools.chain.from_iterable(df2.names.to_list()))

In [204]:
pos_brands_count = [x for x in pos_brands if x not in ['car','problem','sedan','seat']]

In [211]:
pos_brand_pd = pd.DataFrame(list(Counter(pos_brands_count).items()),columns=['brand','count']).sort_values(by='count',ascending=False)

which gives a similar results to brand counts, this is because many comments are surrounded around the top mentioned brands here.

In [213]:
merged_pd = pos_brand_pd.merge(ctr_brands_car, on='brand')
merged_pd.columns = ['brand','pos_count','total_count']

In [215]:
merged_pd['pos_pct'] = merged_pd['pos_count']/merged_pd['total_count']

In [217]:
merged_pd.sort_values(by='pos_pct',ascending=False)

Unnamed: 0,brand,pos_count,total_count,pos_pct
12,dodge,22,72,0.305556
9,kia,40,132,0.30303
23,infiniti,3,10,0.3
20,cadillac,7,25,0.28
10,bmw,30,108,0.277778
17,mitsubishi,12,46,0.26087
25,suzuki,2,8,0.25
19,audi,10,41,0.243902
6,saturn,56,231,0.242424
14,mercury,19,81,0.234568


In [218]:
merged_pd.to_csv('pos_comment_pct.csv')