In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', 1000) # to read records completely
import pickle
import numpy as np
import itertools
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

## Import data from previous preprocessing steps

In [2]:
# Importing scraped and pre-processed data  
df = pd.read_pickle('data_clean.pkl')

In [3]:
models = pd.read_csv('models.csv', header=None, names = ['brand', 'model'])
models['brand'] = models['brand'].str.replace(r'[^\w\s]+', '', regex=True) # remove punctuation errors (e.g. "nissan.")
models = models[~models.brand.isin(['car', 'sedan', 'problem'])] # remove rows that are not brands

In [4]:
# Convert all message words to lowercase
df.Message_words = df.Message_words.apply(lambda x: [i.lower() for i in x])

### Replace models with brands

In [5]:
messages = df.Message_words.to_list()

In [6]:
# Replace first occurence of brand
messages2 = []
t = 0
for m in messages:
    for i,row in models.iterrows():
        m = np.where(m == row['model'], row['brand'], m)
        
    t+=1
    
    #print(t)
        
    messages2.append(m)
        
df['Message_words_v2'] = messages2

In [7]:
# There exists occurences where the text says "nissan, nissan"
# Remove duplicates in lists while keeping order
def remove_consecutive_duplicate (text): #removes consecutive duplicates
    return np.array([i for i, j in itertools.groupby(text)])

df["Message_words_v3"] = df["Message_words_v2"].apply(remove_consecutive_duplicate)

## Link attributes to models

In [8]:
# Retrieve a unique list of brands in the text
brands_list = models.brand.drop_duplicates().to_list()

# Add additional brands not covered
brands_list.extend(['lexus', 'ferrari', 'merzedesbenz', 'tesla','gm', 'peugeot', 'jeep', 'bentley', 'fiat'])

In [9]:
messages = df.Message_words_v3.to_list()

# Returns a list of brands mentioned in each review
brands_in_message = []
for m in messages:
    
    brands_mentioned = []
    for brand in brands_list:
        if len(np.where(m == brand)[0]) >0:
            brands_mentioned.append(brand)
            
    brands_in_message.append(brands_mentioned)

brands_in_message

  if len(np.where(m == brand)[0]) >0:


[[],
 ['nissan'],
 [],
 ['audi', 'lexus'],
 ['nissan'],
 ['nissan'],
 ['audi'],
 ['audi'],
 [],
 ['bmw', 'honda', 'nissan', 'toyota'],
 [],
 ['audi'],
 ['audi'],
 ['audi', 'bmw', 'volkswagen'],
 [],
 ['audi', 'lexus'],
 [],
 [],
 [],
 ['toyota', 'volkswagen'],
 ['bmw', 'seat', 'toyota', 'volkswagen'],
 [],
 [],
 ['volkswagen'],
 ['volkswagen'],
 [],
 [],
 ['bmw'],
 ['audi', 'bmw', 'infiniti', 'lexus'],
 ['volvo'],
 ['audi', 'bmw', 'infiniti', 'nissan', 'lexus'],
 ['infiniti'],
 ['infiniti'],
 ['acura', 'mercedes', 'seat', 'volvo'],
 ['audi'],
 [],
 ['mercedes'],
 [],
 ['mazda', 'toyota'],
 ['nissan', 'toyota', 'volkswagen'],
 ['nissan', 'volkswagen'],
 ['mazda', 'nissan', 'toyota', 'volkswagen'],
 ['honda', 'nissan', 'toyota', 'volkswagen'],
 ['audi', 'chrysler', 'volkswagen'],
 ['mercedes', 'nissan'],
 ['mercedes', 'nissan'],
 [],
 ['bmw', 'nissan'],
 ['chevrolet', 'nissan', 'pontiac', 'gm'],
 ['chevrolet'],
 ['audi', 'volkswagen', 'volvo'],
 ['gm'],
 ['gm'],
 ['volkswagen', 'volvo'],

In [10]:
df['Brands_in_message'] = brands_in_message
df['Qty_brands'] = df.Brands_in_message.apply(lambda x: len(x))
df['Qty_brands'].value_counts()

# 1291 messages don't mention any brand

0     1244
1     1175
2      774
3      464
4      235
5      120
6       74
7       31
8       17
10       7
11       3
12       3
9        2
16       1
13       1
Name: Qty_brands, dtype: int64

We can assign attributes mentioned in the reviews, like the power of a car, to a specific brand. For example, in the sentence "I like the BMW for its power. On the other hand, the Honda is reliable" we would want to assign the the attribute "power" to "BMW" and "reliable" to "Honda". We have created functions for two different approaches:
1) assign all words found between 1st brand mention and next brand mention to 1st brand

2) assign n words to each side of the brand mention to the brand. We found n = 4 to work best. 

In [37]:
# First approach: link all words found between brand mention and next brand mention...
# ...except for the first brand mention which receives words from the start of the message

m = df.Message_words_v3[9] # record 9 is a good example

def get_attr_in_the_right(m, brand_list):

    brand_dic = {}
    for brand in brands_list:
        ix = np.where(m == brand)[0]
        if len(ix) > 0:
            brand_dic[brand] = ix[0]

    # Dict of brand_dic['brand'] = {all words to the right}
    brand_dic = dict(sorted(brand_dic.items(), key=lambda x:x[1]))
    
    # Combined list of all words
    brand_list = list(brand_dic.values()) + [len(m)]

    review={}
    for i, tup in enumerate(brand_dic):        
        if i == 0:            
            review[tup] = m[0:brand_list[1]]
        else:
            review[tup] = m[brand_list[i]:brand_list[i+1]]
    return review

get_attr_in_the_right(m, brands_list)

{'honda': array(['would', 'recommend', 'honda'], dtype='<U158'),
 'toyota': array(['toyota', 'choices', 'sedan', 'new'], dtype='<U158'),
 'nissan': array(['nissan', 'looks', 'impressive', 'would', 'wait', 'reliable',
        '2002', 'proventhe', 'new'], dtype='<U158'),
 'bmw': array(['bmw', '525', 'another', 'nice', 'car', 'rated', 'highly', 'many',
        'auto', 'publications'], dtype='<U158')}

In [38]:
pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in get_attr_in_the_right(m, brands_list).items() ]))


Unnamed: 0,honda,toyota,nissan,bmw
0,would,toyota,nissan,bmw
1,recommend,choices,looks,525
2,honda,sedan,impressive,another
3,,new,would,nice
4,,,wait,car
5,,,reliable,rated
6,,,2002,highly
7,,,proventhe,many
8,,,new,auto
9,,,,publications


In [12]:
# Do this for all records

brand_reviews_1st_approach = []

for m in df.Message_words_v3:

    brand_reviews_1st_approach.append(get_attr_in_the_right(m, brands_list))

# generates a list (1 entry per row) of dictionaries {brand_1: part of text corresponding, 
# brand_2: part of text corresponding,}

  ix = np.where(m == brand)[0]


In [47]:
# 2nd approach: consider n words to each side when associating with brands

m = df.Message_words_v3[9]

def get_attr_n_per_side(m, n, brand_list):

    brand_dic = {}
    for brand in brands_list:
        ix = np.where(m == brand)[0]
        if len(ix) > 0:
            brand_dic[brand] = ix[0]


    brand_dic = dict(sorted(brand_dic.items(), key=lambda x:x[1]))
    
    #print(len(brand_dic))

    brand_list = list(brand_dic.values()) + [len(m)]
    
    #print(brand_list)

    review={}
    for i, tup in enumerate(brand_dic):

        left_loc = (brand_list[i] - n)
        left_loc = max(0, left_loc) # to avoid out of range indexing
        
        right_loc = (brand_list[i] + n)
        right_loc = min(len(m), right_loc) # to avoid OOF indexing
        
        review[tup] = m[left_loc:right_loc]
            
    return review

get_attr_n_per_side(m, 5, brands_list)        

{'honda': array(['would', 'recommend', 'honda', 'toyota', 'choices', 'sedan', 'new'],
       dtype='<U158'),
 'toyota': array(['would', 'recommend', 'honda', 'toyota', 'choices', 'sedan', 'new',
        'nissan'], dtype='<U158'),
 'nissan': array(['honda', 'toyota', 'choices', 'sedan', 'new', 'nissan', 'looks',
        'impressive', 'would', 'wait'], dtype='<U158'),
 'bmw': array(['wait', 'reliable', '2002', 'proventhe', 'new', 'bmw', '525',
        'another', 'nice', 'car'], dtype='<U158')}

In [48]:
# Review df.iloc[9]
pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in get_attr_n_per_side(m, 4, brands_list).items() ]))


Unnamed: 0,honda,toyota,nissan,bmw
0,would,would,toyota,reliable
1,recommend,recommend,choices,2002
2,honda,honda,sedan,proventhe
3,toyota,toyota,new,new
4,choices,choices,nissan,bmw
5,sedan,sedan,looks,525
6,,new,impressive,another
7,,,would,nice


In [51]:
# Review df.iloc[62]
m = df.Message_words_v3[62]
pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in get_attr_n_per_side(m, 4, brands_list).items() ]))


Unnamed: 0,mazda,nissan,toyota,honda
0,would,626,country,cars
1,certainly,would,cars,without
2,qualify,similar,without,pricey
3,1992,age,pricey,toyota
4,mazda,nissan,toyota,honda
5,mx6,good,honda,cars
6,loves,bet,cars,age
7,car,older,age,


In [14]:
# Do this for all records
brand_reviews_2nd_approach = []

for m in df.Message_words_v3:

    brand_reviews_2nd_approach.append(get_attr_n_per_side(m,4, brands_list))

# Generates a list (each row is a post) of dictionaries where {brand_1: part of text corresponding, brand_2: part of text corresponding}

  ix = np.where(m == brand)[0]


In [15]:
# Concatenate all reviews for each brand into a single key in a dictionary
# Creates dictionary: each brand mentioned is a key, value is all words assigned to that brand 
# According to 1st method get_attr_in_the_right()

review_accum_1st = {i:[] for i in brands_list} #deprecated dictionary but still interesting

for brand in brands_list:
    for review in brand_reviews_1st_approach:
        for single_review in review:
            #print(review[single_review])
            if single_review==brand:
                review_accum_1st[brand].extend(review[single_review])

In [16]:
# Same as above but for 2nd method: get_attr_n_per_side()
review_accum_2nd = {i:[] for i in brands_list} 

for brand in brands_list:
    for review in brand_reviews_2nd_approach:
        for single_review in review:
            #print(review[single_review])
            if single_review==brand:
                review_accum_2nd[brand].extend(review[single_review])

In [17]:
# Create Series with all words 
words = pd.Series([i for review in df.Message_words_v3 for i in review])

In [18]:
# Get value count for words excluding brands
words[~words.isin(brands_list)].value_counts().head(60)


car            3874
like           1900
would          1900
get            1797
new            1632
cars           1554
good           1491
one            1447
im             1120
also           1115
dont           1055
drive          1013
miles           979
much            959
used            945
want            932
years           928
price           917
better          916
think           902
well            845
looking         833
really          790
buy             740
know            722
go              671
need            654
year            651
even            650
could           628
great           624
ive             590
vehicle         576
time            570
might           569
driving         563
lot             545
look            541
something       533
best            528
going           525
id              514
old             513
engine          508
3               504
nice            498
civic           496
still           489
probably        489
reliability     485


In [19]:
words_count = words.value_counts()
words_nobrands_count = words[~words.isin(brands_list)].value_counts()

### Stemming

In [None]:
# NOTE: Stemming could potentially cause information loss if words are unintentionally reduced in a way that would result in a 
# common stem being used for different words 
''' Stemming
stemmer = PorterStemmer()

def stem_list (text):
    return pd.Series([stemmer.stem(w) for w in text])

stemmed_words = stem_list(words)
len(stemmed_words)

stemmed_words[~stemmed_words.isin(brands_list)].value_counts().head(60)

stemwords_count = stemmed_words.value_counts()
stemwords_nobrands_count = stemmed_words[~stemmed_words.isin(brands_list)].value_counts()
'''


In [None]:
#For words_nobrands_count remove words that appear 2x or less
words_nobrands_count = words_nobrands_count[words_nobrands_count > 2]
words_nobrands_count

# This csv is used to create the attribute map
#words_nobrands_count.to_csv("attribute_count_final.csv")


## Attribute mapping

Attribute mapping was done by manually browsing rows of xlsx and linking words we found relevant to a common key

In [20]:
df_map = pd.read_excel("attribute_count_final.xlsx", sheet_name = 0)
df_map = df_map.drop('Unnamed: 4', axis=1)

In [21]:
# Where no replacement exists, fill it with the original word
df_map['attribute_flg'] = df_map['attribute_flg'].replace(np.nan, 0)
df_map.attribute_synonym.fillna(df_map.word, inplace=True)


In [22]:
# Sum the frequencies of every attribute group after this and show top results 
df_pivot_map = df_map[df_map.attribute_flg == 1.0].groupby('attribute_synonym').freq.sum().sort_values(ascending=False)
df_pivot_map.head(10)

attribute_synonym
affordability     4509.0
sustainability    1902.0
size              1829.0
driveability      1535.0
engine            1433.0
visual            1202.0
reliable          1012.0
transmission       998.0
comfort            893.0
technological      867.0
Name: freq, dtype: float64

In [66]:
df_attr_count = pd.DataFrame(df_pivot_map.head(10))
df_attr_count = df_attr_count.rename(columns= {"freq": "total_count"})
df_attr_count

Unnamed: 0_level_0,total_count
attribute_synonym,Unnamed: 1_level_1
affordability,4509.0
sustainability,1902.0
size,1829.0
driveability,1535.0
engine,1433.0
visual,1202.0
reliable,1012.0
transmission,998.0
comfort,893.0
technological,867.0


The top 5 attributes are: affordability, sustainability, size, driveability and engine

In [None]:
# Replace words with attributes in df.Message_words_v3
def replace_attributes(m):
    for i,row in df_map.iterrows():
        m = np.where(m == row['word'], row['attribute_synonym'], m)
    return m

In [None]:
# Replaces attributes (Takes ~20-30 mins to run)
df["Message_words_v3"] = df["Message_words_v3"].apply(replace_attributes)

In [None]:
# Create checkpoint csv to avoid long function call later
# df.to_pickle("data_checkpoint.pkl")

In [57]:
df_pivot_map

attribute_synonym
affordability     4509.0
sustainability    1902.0
size              1829.0
driveability      1535.0
engine            1433.0
visual            1202.0
reliable          1012.0
transmission       998.0
comfort            893.0
technological      867.0
power              826.0
age                469.0
performance        436.0
country            315.0
safety             223.0
sound              127.0
torque              93.0
ambiguous           12.0
parts               11.0
Name: freq, dtype: float64

In [60]:
words

0                need
1                help
2            choosing
3                next
4             vehicle
             ...     
227209          35000
227210         toyota
227211     especially
227212       longterm
227213    reliability
Length: 227214, dtype: object

In [58]:
df_map.attribute_synon

Unnamed: 0,word,freq,attribute_flg,attribute_synonym
0,price,737.0,1.0,affordability
1,money,338.0,1.0,affordability
2,lease,311.0,1.0,affordability
3,value,273.0,1.0,affordability
4,cost,270.0,1.0,affordability
...,...,...,...,...
5356,nope,3.0,0.0,nope
5357,scrape,3.0,0.0,scrape
5358,850.0,3.0,0.0,850.0
5359,modelsi,3.0,0.0,modelsi


In [59]:
df

Unnamed: 0,Date,User_Id,Message,NumberOfPastPosts,Role,Message_words,Message_words_v3,Brands_in_message,Qty_brands
0,2001-09-01,pat,"Need help choosing your next vehicle? Tell us your price range, are you thinking New or Used, buying or leasing, what features are must-haves, what other thoughts are on your mind, and let us give you a hand! Tell us your criteria as specifically as you can.You'll find lots of helpful folks here who can give you useful suggestions.",10421,Member,"[need, help, choosing, next, vehicle, tell, us, price, range, thinking, new, used, buying, leasing, features, musthaves, thoughts, mind, let, us, give, hand, tell, us, criteria, specifically, canyoull, find, lots, helpful, folks, give, useful, suggestions]","[need, help, choosing, next, vehicle, tell, us, affordability, sustainability, thinking, new, used, buying, affordability, features, musthaves, thoughts, mind, let, us, give, hand, tell, us, criteria, specifically, canyoull, find, lots, helpful, folks, give, useful, suggestions]",[],0
1,2001-09-01,willow25,"I'm buying a ""new"" car by the end of the year and my first choice is a Nissan Maxima. I've read that they are very reliable and hold their value well. My budget is probably going to allow for a 98-99 and I was wondering if it's worth buying a car that's almost 3-4 years old. Any suggestions or comments would help a lot.willow25",2,Member,"[im, buying, new, car, end, year, first, choice, nissan, maxima, ive, read, reliable, hold, value, well, budget, probably, going, allow, 9899, wondering, worth, buying, car, thats, almost, 34, years, old, suggestions, comments, would, help, lotwillow25]","[im, buying, new, car, end, year, first, choice, nissan, ive, read, reliable, hold, affordability, well, affordability, probably, going, allow, 9899, wondering, affordability, buying, car, thats, almost, 34, years, age, suggestions, comments, would, help, lotwillow25]",[nissan],1
2,2001-10-01,dindak,"Buy a 2001/ 2002 Oldsmobile Intrigue. It's one of the best sedans on the market with great handling and one of the best V6 engines around, a 3.5L DOHC producing 215HP. The deals should be great this year as it's the last on for Intrigue and the warrany on them has been extended to 5 years. The car is recommended by most magazines (including Consumer Reports), but is unfortunately over looked by most consumers. It is the sleeper of all sleepers on the market in my opinion.",6632,Member,"[buy, 2001, 2002, oldsmobile, intrigue, one, best, sedans, market, great, handling, one, best, v6, engines, around, 35l, dohc, producing, 215hp, deals, great, year, last, intrigue, warrany, extended, 5, years, car, recommended, magazines, including, consumer, reports, unfortunately, looked, consumers, sleeper, sleepers, market, opinion]","[buy, 2001, 2002, oldsmobile, intrigue, one, best, sedans, market, great, driveability, one, best, engine, engine, around, 35l, dohc, producing, 215hp, deals, great, year, last, intrigue, warrany, extended, 5, years, car, recommended, magazines, including, consumer, reports, unfortunately, looked, consumers, sleeper, sleepers, market, opinion]",[],0
3,2001-10-01,peteri1,"Am looking at 1 of 4 choices to replace my 99GS400. I've read everything here and on other forums and have seen much conflicting info:4 options:Lexus LS430 with custom/luxury packageInfiniti Q45 with premium pckgAudi A-6 4.2 loaded Audi A-8 not stripped, but not loadedChicago winters--had Blizzaks on the GS--worked fine. My 95Q was pig in snow, but I'm sure 2002Q is a world away from that. I've heard Audi has some QC problems, but I like their maintenance being included. I'm a happy Lexus driver. The Q appears to offer the most for the $$. Will lease and in this economy, I would think I could command an attractive package. Can I go wrong with any of the 4? Am I missing something?",21,Member,"[looking, 1, 4, choices, replace, 99gs400, ive, read, everything, forums, seen, much, conflicting, info4, optionslexus, ls430, customluxury, packageinfiniti, q45, premium, pckgaudi, a6, 42, loaded, audi, a8, stripped, loadedchicago, wintershad, blizzaks, gsworked, fine, 95q, pig, snow, im, sure, 2002q, world, away, ive, heard, audi, qc, problems, like, maintenance, included, im, happy, lexus, driver, q, appears, offer, lease, economy, would, think, could, command, attractive, package, go, wrong, 4, missing, something]","[looking, 1, 4, choices, replace, 99gs400, ive, read, everything, forums, seen, much, conflicting, info4, optionslexus, ls430, customluxury, packageinfiniti, q45, premium, pckgaudi, audi, 42, loaded, audi, stripped, loadedchicago, wintershad, blizzaks, gsworked, fine, 95q, pig, snow, im, sure, 2002q, world, away, ive, heard, audi, qc, problems, like, affordability, included, im, happy, lexus, driver, q, appears, offer, affordability, sustainability, would, think, could, command, attractive, package, go, wrong, 4, missing, something]","[audi, lexus]",2
4,2001-10-01,mrdetailer,"Generally a 3-4 year old car range is a good one because they are basically sound. If possible get one that is still under warrenty. But if not, make sure that a trusted Maxima mechanic goes over it thoroughly. Their V-6 is one of the best rated for over 10 years. Personally I am amazed at the great strides in American Quality recently, so the Olds may not be a bad idea other.",1118,Member,"[generally, 34, year, old, car, range, good, one, basically, sound, possible, get, one, still, warrenty, make, sure, trusted, maxima, mechanic, goes, thoroughly, v6, one, best, rated, 10, years, personally, amazed, great, strides, american, quality, recently, olds, may, bad, idea]","[generally, 34, year, age, car, sustainability, good, one, basically, sound, possible, get, one, still, warrenty, make, sure, trusted, nissan, mechanic, goes, thoroughly, engine, one, best, rated, 10, years, personally, amazed, great, strides, american, reliable, recently, olds, may, bad, idea]",[nissan],1
...,...,...,...,...,...,...,...,...,...
4146,2021-02-01,kyfdx,"qbrozen said: show previous quoteskyfdx said:I'll guess they survey more Accord owners than all Jaguar models combined. How are you ever going to get meaningful data from a make that sells so few cars?Name one other organization that actually compiles data or surveys, long term. I don't think there are any. \nOh, you can't. But that's my point when folks throw old thinking around like ""Honda and Toyota are the only reliable cars."" It just isn't that simple. I would say HyunKia are just as reliable these days. Do I have proof? Nope. But nobody has proof to say otherwise. That's all I'm getting at. \nI bet CR has a lot of surveys on those two makes. You might be right, and they could probably back it up.",203030,Moderator,"[qbrozen, said, show, previous, quoteskyfdx, saidill, guess, survey, accord, owners, jaguar, models, combined, ever, going, get, meaningful, data, make, sells, carsname, one, organization, actually, compiles, data, surveys, long, term, dont, think, oh, cant, thats, point, folks, throw, old, thinking, around, like, honda, toyota, reliable, cars, isnt, simple, would, say, hyunkia, reliable, days, proof, nope, nobody, proof, say, otherwise, thats, im, getting, bet, cr, lot, surveys, two, makes, might, right, could, probably, back]","[qbrozen, said, show, previous, quoteskyfdx, saidill, guess, survey, honda, owners, jaguar, models, combined, ever, going, get, meaningful, data, make, sells, carsname, one, organization, actually, compiles, data, surveys, long, term, dont, think, oh, cant, thats, point, folks, throw, age, thinking, around, like, honda, toyota, reliable, cars, isnt, simple, would, say, hyunkia, reliable, days, proof, nope, nobody, proof, say, otherwise, thats, im, getting, bet, cr, lot, surveys, two, makes, might, right, could, probably, back]","[honda, toyota]",2
4147,2021-02-01,backy,"KamCottage said:Thank you backy. Since I first posted this question, I've come to realize, as mcdawgg said above, that I want a Toyota or Honda because of the reliability. I am just so damn used to longevity in all my cars that I think anything less would make me unhappy in the long run. I don't have the patience or money to repair stuff anymore than I have to... and since I have driven beaters into the ground, I don't care about aesthetics as much as durability. That said if no Toyota or Honda is a good fit for the husband than we will look at an Optima CPO. We're heading to Carmax today to just get in a bunch of cars and see what kind of room there is in the front driver seat for him, 6 foot 4, and me who will be the main driver. \nDid you get a car yet? If not and are still considering the Optima, JD Power just ranked the 2018 Optima the #1 mid-sized car in its latest Vehicle Dependability Study. And Kia was the highest-ranked mass-market brand overall, behind only Lexus and Por...",18946,Member,"[kamcottage, saidthank, backy, since, first, posted, question, ive, come, realize, mcdawgg, said, want, toyota, honda, reliability, damn, used, longevity, cars, think, anything, less, would, make, unhappy, long, run, dont, patience, money, repair, stuff, anymore, since, driven, beaters, ground, dont, care, aesthetics, much, durability, said, toyota, honda, good, fit, husband, look, optima, cpo, heading, carmax, today, get, bunch, cars, see, kind, room, front, driver, seat, 6, foot, 4, main, driver, get, car, yet, still, considering, optima, jd, power, ranked, 2018, optima, 1, midsized, car, latest, vehicle, dependability, study, kia, highestranked, massmarket, brand, overall, behind, lexus, porsche, fyi, 2018, sonata, 2, midsized, ...]","[kamcottage, saidthank, backy, since, first, posted, question, ive, come, realize, mcdawgg, said, want, toyota, honda, reliable, damn, used, longevity, cars, think, anything, less, would, make, unhappy, long, run, dont, patience, affordability, affordability, stuff, anymore, since, driven, beaters, ground, dont, care, visual, much, durability, said, toyota, honda, good, fit, husband, look, kia, affordability, heading, carmax, today, get, bunch, cars, see, kind, size, front, driver, seat, 6, foot, 4, main, driver, get, car, yet, still, considering, kia, jd, power, ranked, 2018, kia, 1, size, car, latest, vehicle, dependability, study, kia, highestranked, massmarket, brand, overall, behind, lexus, porsche, fyi, 2018, sonata, 2, size, ...]","[honda, kia, seat, toyota, lexus]",5
4148,2021-02-01,RayeEliza,"I am doing major research in trying to find my new car, which I am more leaning towards an SUV. Since I recently moved to Washington state and having to deal with snow, I will need a car that I can drive in snow without issues. I am also looking for somewhat good mileage and really good cargo space. I have only three cars I'm looking for that I am interested in which are the Subaru Outback, Toyota RAV4 hybrid, and the Subaru Forester. Which of these would you all recommend or if there are any others that sound just as good. The optimal price range is below $35,000.",6,Member,"[major, research, trying, find, new, car, leaning, towards, suv, since, recently, moved, washington, state, deal, snow, need, car, drive, snow, without, issues, also, looking, somewhat, good, mileage, really, good, cargo, space, three, cars, im, looking, interested, subaru, outback, toyota, rav4, hybrid, subaru, forester, would, recommend, others, sound, good, optimal, price, range, 35000]","[major, research, trying, find, new, car, leaning, towards, suv, since, recently, moved, washington, state, deal, snow, need, car, drive, snow, without, issues, also, looking, somewhat, good, sustainability, really, good, cargo, size, three, cars, im, looking, interested, subaru, outback, toyota, rav4, sustainability, subaru, forester, would, recommend, others, sound, good, optimal, affordability, sustainability, 35000]","[subaru, toyota]",2
4149,2021-02-01,mlevine,I would lean toward subaru. Need to see which model you are comfortable in. Mazda SUVs and honda SUVs also not a bad choice.,512,Member,"[would, lean, toward, subaru, need, see, model, comfortable, mazda, suvs, honda, suvs, also, bad, choice]","[would, lean, toward, subaru, need, see, model, comfort, mazda, suvs, honda, suvs, also, bad, choice]","[honda, mazda, subaru]",3


# Calculating lift scores

In [52]:
# Import checkpoint csv
df = pd.read_pickle("data_checkpoint.pkl")

In [24]:
# Calculates lift between a brand and an attribute
def calculate_lift(brand_name, attribute_name):
    
    # Used to get counts in separate table
    string_list = []
    
    # Get total number of reviews
    review_count = len(df)
    string_list.append(f"There are {review_count} reviews")

    # Count number of times brand appears across reviews
    brand_count = 0
    for review in  df.Message_words_v3:
        if brand_name in review:
            brand_count += 1
    string_list.append(f"{brand_name} appears {brand_count} times")
    
    # Get number of times attribute appears across reviews
    attribute_count = 0
    for review in df.Message_words_v3:
            if attribute_name in review:
                attribute_count += 1
    string_list.append(f"{attribute_name} appears {attribute_count} times")

    # Count number of times attribute and brand appear together
    attr_brand_count = 0
    for review in df.Message_words_v3:
        if brand_name in review and attribute_name in review:
            attr_brand_count += 1
    string_list.append(f"{attribute_name} and {brand_name} appears together {attr_brand_count} times")
    
    # Calculate lift
    lift = review_count * (attr_brand_count/(brand_count * attribute_count))
    
    return lift, string_list

In [25]:
# Define top brands and attributes as described in parts A and C/D
top_attributes = ['affordability', 'sustainability', 'size', 'driveability', 'engine']
top_brands = ['honda','toyota','nissan','volkswagen','chevrolet']

In [26]:
# Initialize dataframes to store lift and support values
df_lift = pd.DataFrame(index=top_brands, columns = top_attributes)
df_lift_counts = pd.DataFrame(index=top_brands, columns = top_attributes)

In [27]:
# Generate matrix for lift calculation
for brand in top_brands:
    for attribute in top_attributes:
        df_lift.loc[brand, attribute], df_lift_counts.loc[brand, attribute] = calculate_lift(brand, attribute)

In [28]:
df_lift

Unnamed: 0,affordability,sustainability,size,driveability,engine
honda,1.236407,1.475249,1.333386,1.285018,1.665103
toyota,1.23449,1.572307,1.41119,1.358744,1.416618
nissan,1.250663,1.334892,1.466724,1.266514,1.612803
volkswagen,1.148263,1.176009,1.306777,1.334974,1.465157
chevrolet,1.212448,1.668234,1.774826,1.719377,2.052918


In [29]:
df_lift_counts

Unnamed: 0,affordability,sustainability,size,driveability,engine
honda,"[There are 4151 reviews, honda appears 924 times, affordability appears 2042 times, affordability and honda appears together 562 times]","[There are 4151 reviews, honda appears 924 times, sustainability appears 1078 times, sustainability and honda appears together 354 times]","[There are 4151 reviews, honda appears 924 times, size appears 1159 times, size and honda appears together 344 times]","[There are 4151 reviews, honda appears 924 times, driveability appears 874 times, driveability and honda appears together 250 times]","[There are 4151 reviews, honda appears 924 times, engine appears 804 times, engine and honda appears together 298 times]"
toyota,"[There are 4151 reviews, toyota appears 769 times, affordability appears 2042 times, affordability and toyota appears together 467 times]","[There are 4151 reviews, toyota appears 769 times, sustainability appears 1078 times, sustainability and toyota appears together 314 times]","[There are 4151 reviews, toyota appears 769 times, size appears 1159 times, size and toyota appears together 303 times]","[There are 4151 reviews, toyota appears 769 times, driveability appears 874 times, driveability and toyota appears together 220 times]","[There are 4151 reviews, toyota appears 769 times, engine appears 804 times, engine and toyota appears together 211 times]"
nissan,"[There are 4151 reviews, nissan appears 525 times, affordability appears 2042 times, affordability and nissan appears together 323 times]","[There are 4151 reviews, nissan appears 525 times, sustainability appears 1078 times, sustainability and nissan appears together 182 times]","[There are 4151 reviews, nissan appears 525 times, size appears 1159 times, size and nissan appears together 215 times]","[There are 4151 reviews, nissan appears 525 times, driveability appears 874 times, driveability and nissan appears together 140 times]","[There are 4151 reviews, nissan appears 525 times, engine appears 804 times, engine and nissan appears together 164 times]"
volkswagen,"[There are 4151 reviews, volkswagen appears 370 times, affordability appears 2042 times, affordability and volkswagen appears together 209 times]","[There are 4151 reviews, volkswagen appears 370 times, sustainability appears 1078 times, sustainability and volkswagen appears together 113 times]","[There are 4151 reviews, volkswagen appears 370 times, size appears 1159 times, size and volkswagen appears together 135 times]","[There are 4151 reviews, volkswagen appears 370 times, driveability appears 874 times, driveability and volkswagen appears together 104 times]","[There are 4151 reviews, volkswagen appears 370 times, engine appears 804 times, engine and volkswagen appears together 105 times]"
chevrolet,"[There are 4151 reviews, chevrolet appears 337 times, affordability appears 2042 times, affordability and chevrolet appears together 201 times]","[There are 4151 reviews, chevrolet appears 337 times, sustainability appears 1078 times, sustainability and chevrolet appears together 146 times]","[There are 4151 reviews, chevrolet appears 337 times, size appears 1159 times, size and chevrolet appears together 167 times]","[There are 4151 reviews, chevrolet appears 337 times, driveability appears 874 times, driveability and chevrolet appears together 122 times]","[There are 4151 reviews, chevrolet appears 337 times, engine appears 804 times, engine and chevrolet appears together 134 times]"


In [32]:
#computing the lower triangle of the array
np.tril(np.ones(df_lift.shape)).astype(np.bool)[0:10,0:10]
df_lt_attr = df_lift.where(np.tril(np.ones(df_lift.shape)).astype(np.bool))
df_lt_attr


#dealing with null values 
df_lt_attr.fillna(df_lt_attr.max().max()+1, inplace=True)

#adding color coding
def color_max_white(val, max_val):
    color = 'white' if val == max_val else 'black'
    return 'color: %s' % color

def highlight_max(data, color='white'):
    attr = 'background-color: {}'.format(color)
    if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
        is_max = data == data.max()
        return [attr if v else '' for v in is_max]
    else:  # from .apply(axis=None)
        is_max = data == data.max().max()
        return pd.DataFrame(np.where(is_max, attr, ''),
                            index=data.index, columns=data.columns)

max_val = df_lt_attr.max().max()

def make_pretty(styler):
    styler.set_caption("Lift Ratios")
    styler.background_gradient(cmap='YlGnBu', axis=None).applymap(lambda x: color_max_white(x, max_val)).apply(highlight_max, axis=None)
    return styler

make_pretty(df_lt_attr.style)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  np.tril(np.ones(df_lift.shape)).astype(np.bool)[0:10,0:10]
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  df_lt_attr = df_lift.where(np.tril(np.ones(df_lift.shape)).astype(np.bool))


Unnamed: 0,affordability,sustainability,size,driveability,engine
honda,1.236407,3.052918,3.052918,3.052918,3.052918
toyota,1.23449,1.572307,3.052918,3.052918,3.052918
nissan,1.250663,1.334892,1.466724,3.052918,3.052918
volkswagen,1.148263,1.176009,1.306777,1.334974,3.052918
chevrolet,1.212448,1.668234,1.774826,1.719377,2.052918


## Calculating lift from brand-attribute assignment lists
brand_reviews_1st_approach and brand_reviews_2nd_approach


In [None]:
# Get split reviews with replaced attributes
# Generates a list (1 entry per row) of dictionaries {brand_1: part of text corresponding, brand_2: part of text corresponding}
brand_reviews_1st_mapped = []

for m in df.Message_words_v3:

    brand_reviews_1st_mapped.append(get_attr_in_the_right(m, brands_list))

brand_reviews_2nd_mapped = []

for m in df.Message_words_v3:

    brand_reviews_2nd_mapped.append(get_attr_n_per_side(m,4, brands_list))

In [None]:
# Calculate split from list of dictionaries brand_reviews_1st_mapped and brand_reviews_2nd_mapped

def calculate_lift_split(brand_name, attribute_name, review_list):
    
    # Used to get counts in separate table
    string_list = []
    
    # Get total number of reviews - defined as a part of a forum post that talks about a specific brand
    # Each review is therefore a split of the entire post, the split being made in 2 different ways as seen above (get_attr_in_the_right, get_attr_n_per_side )
    
    review_count = 0
    for review in review_list: #review_list is a list of dicts, so review is a dict
        review_count += len(review)
    string_list.append(f"There are {review_count} reviews about specific brands")

    # Count number of times brand appears across split reviews
    brand_count = 0
    for review in review_list: #review_list is a list of dicts, so review is a dict
        for key in review.keys():
            if brand_name == key:
                brand_count += 1
    string_list.append(f"{brand_name} appears {brand_count} times")
    
    # Count number of times attribute appears across split reviews
    attribute_count = 0
    for review in review_list:
        for value in review.values():
            if attribute_name in value:
                attribute_count += 1
    string_list.append(f"{attribute_name} appears {attribute_count} times")

    # Count number of times attribute and brand appear together
    attr_brand_count = 0
    for review in brand_reviews_1st_mapped:
        for key, value in review.items():
            if brand_name == key and attribute_name in value:
                attr_brand_count += 1
    string_list.append(f"{attribute_name} and {brand_name} appears together {attr_brand_count} times")
   
    # Calculate lift
    lift = review_count * (attr_brand_count/(brand_count * attribute_count))
    return lift, string_list

In [None]:
# Initialize dataframes
df_lift_1st_mapped = pd.DataFrame(index=top_brands, columns = top_attributes)
df_lift_1st_counts = pd.DataFrame(index=top_brands, columns = top_attributes)
df_lift_2nd_mapped = pd.DataFrame(index=top_brands, columns = top_attributes)
df_lift_2nd_counts = pd.DataFrame(index=top_brands, columns = top_attributes)

In [None]:
# Getting lift for brand_reviews_1st_mapped
for brand in top_brands:
    for attribute in top_attributes:
        df_lift_1st_mapped.loc[brand, attribute], df_lift_1st_counts.loc[brand, attribute] = calculate_lift_split(brand, attribute, brand_reviews_1st_mapped)

In [None]:
df_lift_1st_mapped

In [None]:
df_lift_1st_counts

In [None]:
# Getting lift for brand_reviews_2nd_mapped
for brand in top_brands:
    for attribute in top_attributes:
        df_lift_2nd_mapped.loc[brand, attribute], df_lift_2nd_counts.loc[brand, attribute] = calculate_lift_split(brand, attribute, brand_reviews_2nd_mapped)

In [None]:
df_lift_2nd_mapped

In [None]:
df_lift_2nd_counts