In [1]:
#importing libraries
import pandas as pd
from textblob import TextBlob
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import collections
from nltk.stem import WordNetLemmatizer

### Loading comments and model-brands data


In [2]:
data = pd.read_csv('comments_raw.csv')[:5000]
data.head()

Unnamed: 0,page,comment_id,user_id,date,comment
0,1,3504465,410384,"April 11, 2007 6:52PM",Hi Pat:You forgot the Chrysler Sebring
1,1,3515400,209396,"April 11, 2007 7:33PM",I'm sure some folks would appreciate having th...
2,1,3516719,457562,"April 12, 2007 6:51AM",You can try to revive this topic but without b...
3,1,3517791,410027,"April 12, 2007 8:43AM",Model vs. model is exactly what we're here for...
4,1,3518875,411850,"April 13, 2007 11:49AM",The Altima is my favorite of the bunch. It is ...


In [3]:
model = pd.read_csv('models.csv')
model.head()

Unnamed: 0,acura,integra
0,acura,Legend
1,acura,vigor
2,acura,rlx
3,acura,ILX
4,acura,MDX


In [4]:
model_dict = dict(zip(model.integra, model.acura))

#### Removing non-brand keys from the model-brand dictionary

In [5]:
del model_dict["cars"]
del model_dict["seats"]
del model_dict["problems"]
del model_dict["sedans"]

#### Replacing models with brands

In [6]:
def replace_all(text):
    for i, j in model_dict.items():
        text = text.replace(i, j.lower())
    return text

In [7]:
data['com_replaced'] = data['comment'].apply(replace_all)

In [8]:
models_input = model.acura.unique()

In [9]:
brand_list = models_input.tolist()
brand_list.remove("problem")
brand_list.remove("car")
brand_list.remove("seat")
brand_list.remove("sedan")

In [10]:
wl = WordNetLemmatizer()

### Data cleaning and pre-processing

#### Getting part-of-speech tags for comments

In [11]:
pos_dict = {'J':wordnet.ADJ, 'V':wordnet.VERB, 'N':wordnet.NOUN, 'R':wordnet.ADV}
def token_stop_pos(text):
    tags = pos_tag(word_tokenize(text))
    newlist = []
    for word, tag in tags:
        #print(word,tag)
        if word.lower() not in set(stopwords.words('english')):
            newlist.append(tuple([word.lower(), pos_dict.get(tag[0])]))
    return newlist

data['pos'] = data['comment'].apply(token_stop_pos)

#### lemmatizing comments

In [12]:
def lemmatize(pos_data):
    lemma_rew = " "
    for word, pos in pos_data:
        #print(word,pos)
        if not pos:
            #print(lemma)
            lemma = word
            lemma_rew = lemma_rew + " " + lemma
        else:
            #print("else")
            lemma = wl.lemmatize(word, pos=pos)
            lemma_rew = lemma_rew + " " + lemma
    return lemma_rew

data['Lemma'] = data['pos'].apply(lemmatize)

#### performing sentiment analysis on comments

This is to understand the context of the aspirational words used with reference to brands.
TextBlob functions gives output in the form a tuple with two values:
- Positivity (range: -1 to 1): Here, 1 and -1 represent extreme positive and negative ends respectively.
- Subjectivity (range: 0 to 1): Here, 0 represents the statement in general context and one represents the statement in subjective context.

In [13]:
def datablob(lemma):
    return TextBlob(lemma).sentiment

data['Blob'] = data['Lemma'].apply(datablob)

#### Defining a set of aspirational words to look out for in comments

In [14]:
aspiration = ['Premium', 'luxury', 'lux', 'grace', 'style', 'buy', 'wishlist', 'wish', 'own', 'dream', 'expensive', 
              'class', 'swift', 'smooth', 'pricey', 'elite', 'favorite', 'brand']

#### Finding aspirational words from above defined list in the comments

In [15]:
def aspire(Lemma):
    tokens = word_tokenize(Lemma)
    aspire_words=[]
    for item in tokens: 
        if item.lower() in (string.lower() for string in aspiration) and item.lower() not in (string.lower() for string in aspire_words):
            aspire_words.append(item.lower())
    return aspire_words

data['Aspire'] = data['Lemma'].apply(aspire)

#### Only keeping rows which detected aspirational words

In [16]:
df = data.loc[data['Aspire'].str.len() >= 1]

#### Getting brand names from the comments

In [17]:
def get_name(pos_data):
    nouns = []
    for word, pos in pos_data:
        #print(word,pos)
        if pos=="n" and word.lower() in (string.lower() for string in brand_list) and word.lower() not in (string.lower() for string in nouns):
            nouns.append(word.lower())
    return nouns

df['names'] = df['pos'].apply(get_name)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['names'] = df['pos'].apply(get_name)


Unnamed: 0,page,comment_id,user_id,date,comment,com_replaced,pos,Lemma,Blob,Aspire,names
4,1,3518875,411850,"April 13, 2007 11:49AM",The Altima is my favorite of the bunch. It is ...,The Altima is my favorite of the bunch. It is ...,"[(altima, n), (favorite, n), (bunch, n), (., N...",altima favorite bunch . amongst fast best ha...,"(0.15833333333333335, 0.5805555555555557)","[favorite, expensive]","[ford, nissan, mazda, hyundai, kia]"
6,1,3521034,410821,"April 13, 2007 12:18PM",Buick LaCrossePassat(Audi A6 in non-lux trim)V...,Buick LaCrossePassat(Audi A6 in non-lux trim)V...,"[(buick, n), (lacrossepassat, n), ((, None), (...",buick lacrossepassat ( audi a6 non-lux trim ...,"(-0.038888888888888896, 0.33888888888888885)",[luxury],"[buick, audi, volvo]"
9,1,3504466,380418,"April 13, 2007 12:32PM","My daily driver is an '03 Maxima, and the '07 ...","My daily driver is an '03 Maxima, and the '07 ...","[(daily, a), (driver, n), ('03, a), (maxima, n...","daily driver '03 maximum , '07 altima feel f...","(0.1903030303030303, 0.5237878787878788)",[pricey],[]
10,1,3505565,411850,"April 13, 2007 12:33PM",P.S. the CVT in the Altima has to be driven li...,P.S. the CVT in the Altima has to be driven li...,"[(p.s, n), (., None), (cvt, n), (altima, n), (...",p.s . cvt altima drive like motorcycle . nee...,"(0.12023809523809523, 0.4988095238095238)","[style, buy]",[]
11,1,3506661,411850,"April 13, 2007 12:39PM",Its interesting how that happens. There is no ...,Its interesting how that happens. There is no ...,"[(interesting, a), (happens, v), (., None), (r...",interesting happen . real successor contour ...,"(0.31722222222222224, 0.582962962962963)",[pricey],[ford]


#### Dropping other columns

In [18]:
df1 = df.drop(['page', 'comment_id', 'comment', 'user_id', 'date'], axis=1)

In [19]:
#df1.to_csv('df1.csv', index=False)

#### Filtering data based on positivity and subjectivity
- Positivity: Keeping data with positive values above 0 so that the brands are mentioned in positive context wrt aspirational words.
- Subjectivity: Keeping data with subjective score>=0.5 so that we can assume its the subjective opinion of user regarding the brand which would suggest the user's preference towards the brand.

In [20]:
df2 = df1.loc[(df1['Blob'].str[0] >= 0) & (df1['Blob'].str[1] >= 0.5)]
df2.drop(['pos', 'Blob'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [21]:
df2.head()

Unnamed: 0,com_replaced,Lemma,Aspire,names
4,The Altima is my favorite of the bunch. It is ...,altima favorite bunch . amongst fast best ha...,"[favorite, expensive]","[ford, nissan, mazda, hyundai, kia]"
9,"My daily driver is an '03 Maxima, and the '07 ...","daily driver '03 maximum , '07 altima feel f...",[pricey],[]
11,Its interesting how that happens. There is no ...,interesting happen . real successor contour ...,[pricey],[ford]
24,I have been driving Accords for 16 years now. ...,"drive accord 16 year . 12 year first one , 4...","[smooth, buy, brand]",[]
35,I just read in my Dec. 2006 issue of Motor Tre...,read dec. 2006 issue motor trend poor amount...,[class],[]


In [22]:
#df2.to_csv("df2.csv", index=False)

### Calculations and frequencies

#### Calculating frequency of different brands from the df2 dataset

In [23]:
brands = []
def get_freq(data):
    for word in data:
        brands.append(word)

df2['names'].apply(get_freq)

4       None
9       None
11      None
24      None
35      None
        ... 
4965    None
4975    None
4977    None
4982    None
4996    None
Name: names, Length: 610, dtype: object

#### Calculating frequency of different aspirational words from the df2 dataset

In [24]:
aspire_freq = []
def get_freq_aspire(data):
    for word in data:
        aspire_freq.append(word)

df2['Aspire'].apply(get_freq_aspire)

4       None
9       None
11      None
24      None
35      None
        ... 
4965    None
4975    None
4977    None
4982    None
4996    None
Name: Aspire, Length: 610, dtype: object

#### Calculating frequency of combination of brands and aspirational words from the df2 dataset

In [26]:
combinations = []
for w1 in brand_list:
    for w2 in aspiration:
        temp = (w1,w2)
        combinations.append(temp)

In [34]:
combination_freq=[0]*len(combinations)

def get_combo_freq(comment):
    for w1,w2 in combinations:
        if w1.lower() in comment.lower() and w2.lower() in comment.lower():
            combination_freq[combinations.index((w1,w2))]+=1
                             
df2['com_replaced'].apply(get_combo_freq)  

4       None
9       None
11      None
24      None
35      None
        ... 
4965    None
4975    None
4977    None
4982    None
4996    None
Name: com_replaced, Length: 610, dtype: object

#### Preparing data for lift calculation

#### Converting brand count to dictionary

In [35]:
ctr_brands = dict(collections.Counter(brands))
print(ctr_brands)

{'ford': 111, 'nissan': 29, 'mazda': 55, 'hyundai': 36, 'kia': 18, 'bmw': 23, 'buick': 6, 'cadillac': 7, 'honda': 144, 'audi': 9, 'chrysler': 22, 'dodge': 15, 'toyota': 62, 'acura': 10, 'mitsubishi': 5, 'saturn': 14, 'subaru': 12, 'lincoln': 6, 'mercury': 9, 'mercedes': 5, 'chevrolet': 3, 'infiniti': 1, 'suzuki': 2, 'volvo': 3, 'pontiac': 2, 'volkswagen': 1}


#### Converting aspirational word count to dictionary

In [36]:
ctr_aspire = dict(collections.Counter(aspire_freq))
print(ctr_aspire)

{'favorite': 20, 'expensive': 32, 'pricey': 9, 'smooth': 44, 'buy': 289, 'brand': 61, 'class': 98, 'own': 108, 'style': 71, 'wish': 31, 'premium': 31, 'luxury': 10, 'dream': 9, 'lux': 1, 'grace': 1}


#### Calculating lift values for each (brand, aspirational word) pair

In [37]:
lift=[0]*len(combinations)
i=0
pw1=0
pw2=0
for w1,w2 in combinations:
    if w1 in ctr_brands.keys():
        pw1 = ctr_brands[w1]
    if w2 in ctr_aspire.keys():
        pw2 = ctr_aspire[w2]
    if pw1>0 and pw2>0:
        lift[i] = combination_freq[i]/(pw1*pw2)
    i+=1
    pw1=0
    pw2=0

In [38]:
final = pd.DataFrame(list(zip(combinations, lift)), columns =['combination', 'lift'])

In [39]:
final.loc[final['lift'] > 0]

Unnamed: 0,combination,lift
1,"(acura, luxury)",0.020000
2,"(acura, lux)",0.200000
4,"(acura, style)",0.002817
5,"(acura, buy)",0.001038
7,"(acura, wish)",0.009677
...,...,...
491,"(volvo, buy)",0.001153
494,"(volvo, own)",0.009259
496,"(volvo, expensive)",0.010417
499,"(volvo, smooth)",0.007576


In [40]:
final.loc[final['lift'] > 0].to_csv("final.csv", index=False)