In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', None) # to read records completely
import pickle
import numpy as np
import itertools
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

## Previous data prep necessary for c) and d)

In [2]:
df = pd.read_pickle('data_clean.pkl')

In [3]:
models = pd.read_csv('models.csv', header=None, names = ['brand', 'model'])
models['brand'] = models['brand'].str.replace(r'[^\w\s]+', '', regex=True) # clean some punctuation errors (nissan.)

In [4]:
models = models[~models.brand.isin(['car', 'sedan', 'problem'])]

In [5]:
df.Message.str.contains('said').sum()

370

In [6]:
df.Message_words = df.Message_words.apply(lambda x: [i.lower() for i in x])

### Replace models with brands

In [7]:
messages = df.Message_words.to_list()

In [8]:
# numpy.where to do the replacement
messages2 = []
t = 0
for m in messages:
    for i,row in models.iterrows():
        m = np.where(m == row['model'], row['brand'], m)
        
    t+=1
    
    #print(t)
        
    messages2.append(m)
        
df['Message_words_v2'] = messages2

In [9]:
# let's find the occurences of changes
m1 = df.Message_words.to_list()
m2 = df.Message_words_v2.to_list()

c = 0
W = 0
for i in range(len(m2)):
    for j in range(len(m2[i])):
        if m1[i][j] != m2[i][j]: #word by word comparison
            c +=1
        W += 1

In [10]:
print(f'{c} changes of models')

6368 changes of models


In [11]:
df.head(10)

Unnamed: 0,Date,User_Id,Message,NumberOfPastPosts,Role,Message_words,Message_words_v2
0,2001-09-01,pat,"Need help choosing your next vehicle? Tell us your price range, are you thinking New or Used, buying or leasing, what features are must-haves, what other thoughts are on your mind, and let us give you a hand! Tell us your criteria as specifically as you can.You'll find lots of helpful folks here who can give you useful suggestions.",10421,Member,"[need, help, choosing, next, vehicle, tell, us, price, range, thinking, new, used, buying, leasing, features, musthaves, thoughts, mind, let, us, give, hand, tell, us, criteria, specifically, canyoull, find, lots, helpful, folks, give, useful, suggestions]","[need, help, choosing, next, vehicle, tell, us, price, range, thinking, new, used, buying, leasing, features, musthaves, thoughts, mind, let, us, give, hand, tell, us, criteria, specifically, canyoull, find, lots, helpful, folks, give, useful, suggestions]"
1,2001-09-01,willow25,"I'm buying a ""new"" car by the end of the year and my first choice is a Nissan Maxima. I've read that they are very reliable and hold their value well. My budget is probably going to allow for a 98-99 and I was wondering if it's worth buying a car that's almost 3-4 years old. Any suggestions or comments would help a lot.willow25",2,Member,"[im, buying, new, car, end, year, first, choice, nissan, maxima, ive, read, reliable, hold, value, well, budget, probably, going, allow, 9899, wondering, worth, buying, car, thats, almost, 34, years, old, suggestions, comments, would, help, lotwillow25]","[im, buying, new, car, end, year, first, choice, nissan, nissan, ive, read, reliable, hold, value, well, budget, probably, going, allow, 9899, wondering, worth, buying, car, thats, almost, 34, years, old, suggestions, comments, would, help, lotwillow25]"
2,2001-10-01,dindak,"Buy a 2001/ 2002 Oldsmobile Intrigue. It's one of the best sedans on the market with great handling and one of the best V6 engines around, a 3.5L DOHC producing 215HP. The deals should be great this year as it's the last on for Intrigue and the warrany on them has been extended to 5 years. The car is recommended by most magazines (including Consumer Reports), but is unfortunately over looked by most consumers. It is the sleeper of all sleepers on the market in my opinion.",6632,Member,"[buy, 2001, 2002, oldsmobile, intrigue, one, best, sedans, market, great, handling, one, best, v6, engines, around, 35l, dohc, producing, 215hp, deals, great, year, last, intrigue, warrany, extended, 5, years, car, recommended, magazines, including, consumer, reports, unfortunately, looked, consumers, sleeper, sleepers, market, opinion]","[buy, 2001, 2002, oldsmobile, intrigue, one, best, sedans, market, great, handling, one, best, v6, engines, around, 35l, dohc, producing, 215hp, deals, great, year, last, intrigue, warrany, extended, 5, years, car, recommended, magazines, including, consumer, reports, unfortunately, looked, consumers, sleeper, sleepers, market, opinion]"
3,2001-10-01,peteri1,"Am looking at 1 of 4 choices to replace my 99GS400. I've read everything here and on other forums and have seen much conflicting info:4 options:Lexus LS430 with custom/luxury packageInfiniti Q45 with premium pckgAudi A-6 4.2 loaded Audi A-8 not stripped, but not loadedChicago winters--had Blizzaks on the GS--worked fine. My 95Q was pig in snow, but I'm sure 2002Q is a world away from that. I've heard Audi has some QC problems, but I like their maintenance being included. I'm a happy Lexus driver. The Q appears to offer the most for the $$. Will lease and in this economy, I would think I could command an attractive package. Can I go wrong with any of the 4? Am I missing something?",21,Member,"[looking, 1, 4, choices, replace, 99gs400, ive, read, everything, forums, seen, much, conflicting, info4, optionslexus, ls430, customluxury, packageinfiniti, q45, premium, pckgaudi, a6, 42, loaded, audi, a8, stripped, loadedchicago, wintershad, blizzaks, gsworked, fine, 95q, pig, snow, im, sure, 2002q, world, away, ive, heard, audi, qc, problems, like, maintenance, included, im, happy, lexus, driver, q, appears, offer, lease, economy, would, think, could, command, attractive, package, go, wrong, 4, missing, something]","[looking, 1, 4, choices, replace, 99gs400, ive, read, everything, forums, seen, much, conflicting, info4, optionslexus, ls430, customluxury, packageinfiniti, q45, premium, pckgaudi, audi, 42, loaded, audi, audi, stripped, loadedchicago, wintershad, blizzaks, gsworked, fine, 95q, pig, snow, im, sure, 2002q, world, away, ive, heard, audi, qc, problems, like, maintenance, included, im, happy, lexus, driver, q, appears, offer, lease, economy, would, think, could, command, attractive, package, go, wrong, 4, missing, something]"
4,2001-10-01,mrdetailer,"Generally a 3-4 year old car range is a good one because they are basically sound. If possible get one that is still under warrenty. But if not, make sure that a trusted Maxima mechanic goes over it thoroughly. Their V-6 is one of the best rated for over 10 years. Personally I am amazed at the great strides in American Quality recently, so the Olds may not be a bad idea other.",1118,Member,"[generally, 34, year, old, car, range, good, one, basically, sound, possible, get, one, still, warrenty, make, sure, trusted, maxima, mechanic, goes, thoroughly, v6, one, best, rated, 10, years, personally, amazed, great, strides, american, quality, recently, olds, may, bad, idea]","[generally, 34, year, old, car, range, good, one, basically, sound, possible, get, one, still, warrenty, make, sure, trusted, nissan, mechanic, goes, thoroughly, v6, one, best, rated, 10, years, personally, amazed, great, strides, american, quality, recently, olds, may, bad, idea]"
5,2001-10-01,dindak,Maxima and Intrigue are both excellent choices for midsize sedans.,6632,Member,"[maxima, intrigue, excellent, choices, midsize, sedans]","[nissan, intrigue, excellent, choices, midsize, sedans]"
6,2001-10-01,canadiancl,"If I had that kind of budget, I would go for the A6 4.2 - - best combination of sport and luxury amongst the 4 choices.",1078,Member,"[kind, budget, would, go, a6, 42, best, combination, sport, luxury, amongst, 4, choices]","[kind, budget, would, go, audi, 42, best, combination, sport, luxury, amongst, 4, choices]"
7,2001-10-01,qbrozen,"Actually, I was about to agree with the A6, but I just looked at the specs here on Edmunds. Both the A6 and A8 are 4000 lbs, and considering the A8 gets a bump in HP over the A6, I don't think performance difference is an issue. So, considering that and the fact that the A6 would possibly be to small for you based on the rest of your list, I say go for the A8. I think you'll find the AWD to be the best addition to a car you could ever have hoped for.",30400,Member,"[actually, agree, a6, looked, specs, edmunds, a6, a8, 4000, lbs, considering, a8, gets, bump, hp, a6, dont, think, performance, difference, issue, considering, fact, a6, would, possibly, small, based, rest, list, say, go, a8, think, youll, find, awd, best, addition, car, could, ever, hoped]","[actually, agree, audi, looked, specs, edmunds, audi, audi, 4000, lbs, considering, audi, gets, bump, hp, audi, dont, think, performance, difference, issue, considering, fact, audi, would, possibly, small, based, rest, list, say, go, audi, think, youll, find, awd, best, addition, car, could, ever, hoped]"
8,2001-10-01,black_tulip,I'd pass Intrigue only for one reason: crash test results are terrible...,438,Member,"[id, pass, intrigue, one, reason, crash, test, results, terrible]","[id, pass, intrigue, one, reason, crash, test, results, terrible]"
9,2001-10-01,paulo3,I would recommend the Honda Accord or Toyota Camry as my choices for a sedan. The new Nissan Altima looks impressive but I would wait until the reliability of the 2002 is proven.The new BMW 525 is another nice car and is rated highly by many auto publications.,113,Member,"[would, recommend, honda, accord, toyota, camry, choices, sedan, new, nissan, altima, looks, impressive, would, wait, reliability, 2002, proventhe, new, bmw, 525, another, nice, car, rated, highly, many, auto, publications]","[would, recommend, honda, honda, toyota, toyota, choices, sedan, new, nissan, nissan, looks, impressive, would, wait, reliability, 2002, proventhe, new, bmw, 525, another, nice, car, rated, highly, many, auto, publications]"


In [12]:
#Get a unique list of brands
brands_list = models.brand.drop_duplicates().to_list()

In [13]:
#Removing duplicates in lists while keeping order
#import itertools
def remove_consecutive_duplicate (text):
    return np.array([i for i, j in itertools.groupby(text)])

In [14]:
df["Message_words_v3"] = df["Message_words_v2"].apply(remove_consecutive_duplicate)

In [15]:
type(df["Message_words_v2"][0])

numpy.ndarray

In [16]:
type(df["Message_words_v3"][0])

numpy.ndarray

In [17]:
df.head(10)

Unnamed: 0,Date,User_Id,Message,NumberOfPastPosts,Role,Message_words,Message_words_v2,Message_words_v3
0,2001-09-01,pat,"Need help choosing your next vehicle? Tell us your price range, are you thinking New or Used, buying or leasing, what features are must-haves, what other thoughts are on your mind, and let us give you a hand! Tell us your criteria as specifically as you can.You'll find lots of helpful folks here who can give you useful suggestions.",10421,Member,"[need, help, choosing, next, vehicle, tell, us, price, range, thinking, new, used, buying, leasing, features, musthaves, thoughts, mind, let, us, give, hand, tell, us, criteria, specifically, canyoull, find, lots, helpful, folks, give, useful, suggestions]","[need, help, choosing, next, vehicle, tell, us, price, range, thinking, new, used, buying, leasing, features, musthaves, thoughts, mind, let, us, give, hand, tell, us, criteria, specifically, canyoull, find, lots, helpful, folks, give, useful, suggestions]","[need, help, choosing, next, vehicle, tell, us, price, range, thinking, new, used, buying, leasing, features, musthaves, thoughts, mind, let, us, give, hand, tell, us, criteria, specifically, canyoull, find, lots, helpful, folks, give, useful, suggestions]"
1,2001-09-01,willow25,"I'm buying a ""new"" car by the end of the year and my first choice is a Nissan Maxima. I've read that they are very reliable and hold their value well. My budget is probably going to allow for a 98-99 and I was wondering if it's worth buying a car that's almost 3-4 years old. Any suggestions or comments would help a lot.willow25",2,Member,"[im, buying, new, car, end, year, first, choice, nissan, maxima, ive, read, reliable, hold, value, well, budget, probably, going, allow, 9899, wondering, worth, buying, car, thats, almost, 34, years, old, suggestions, comments, would, help, lotwillow25]","[im, buying, new, car, end, year, first, choice, nissan, nissan, ive, read, reliable, hold, value, well, budget, probably, going, allow, 9899, wondering, worth, buying, car, thats, almost, 34, years, old, suggestions, comments, would, help, lotwillow25]","[im, buying, new, car, end, year, first, choice, nissan, ive, read, reliable, hold, value, well, budget, probably, going, allow, 9899, wondering, worth, buying, car, thats, almost, 34, years, old, suggestions, comments, would, help, lotwillow25]"
2,2001-10-01,dindak,"Buy a 2001/ 2002 Oldsmobile Intrigue. It's one of the best sedans on the market with great handling and one of the best V6 engines around, a 3.5L DOHC producing 215HP. The deals should be great this year as it's the last on for Intrigue and the warrany on them has been extended to 5 years. The car is recommended by most magazines (including Consumer Reports), but is unfortunately over looked by most consumers. It is the sleeper of all sleepers on the market in my opinion.",6632,Member,"[buy, 2001, 2002, oldsmobile, intrigue, one, best, sedans, market, great, handling, one, best, v6, engines, around, 35l, dohc, producing, 215hp, deals, great, year, last, intrigue, warrany, extended, 5, years, car, recommended, magazines, including, consumer, reports, unfortunately, looked, consumers, sleeper, sleepers, market, opinion]","[buy, 2001, 2002, oldsmobile, intrigue, one, best, sedans, market, great, handling, one, best, v6, engines, around, 35l, dohc, producing, 215hp, deals, great, year, last, intrigue, warrany, extended, 5, years, car, recommended, magazines, including, consumer, reports, unfortunately, looked, consumers, sleeper, sleepers, market, opinion]","[buy, 2001, 2002, oldsmobile, intrigue, one, best, sedans, market, great, handling, one, best, v6, engines, around, 35l, dohc, producing, 215hp, deals, great, year, last, intrigue, warrany, extended, 5, years, car, recommended, magazines, including, consumer, reports, unfortunately, looked, consumers, sleeper, sleepers, market, opinion]"
3,2001-10-01,peteri1,"Am looking at 1 of 4 choices to replace my 99GS400. I've read everything here and on other forums and have seen much conflicting info:4 options:Lexus LS430 with custom/luxury packageInfiniti Q45 with premium pckgAudi A-6 4.2 loaded Audi A-8 not stripped, but not loadedChicago winters--had Blizzaks on the GS--worked fine. My 95Q was pig in snow, but I'm sure 2002Q is a world away from that. I've heard Audi has some QC problems, but I like their maintenance being included. I'm a happy Lexus driver. The Q appears to offer the most for the $$. Will lease and in this economy, I would think I could command an attractive package. Can I go wrong with any of the 4? Am I missing something?",21,Member,"[looking, 1, 4, choices, replace, 99gs400, ive, read, everything, forums, seen, much, conflicting, info4, optionslexus, ls430, customluxury, packageinfiniti, q45, premium, pckgaudi, a6, 42, loaded, audi, a8, stripped, loadedchicago, wintershad, blizzaks, gsworked, fine, 95q, pig, snow, im, sure, 2002q, world, away, ive, heard, audi, qc, problems, like, maintenance, included, im, happy, lexus, driver, q, appears, offer, lease, economy, would, think, could, command, attractive, package, go, wrong, 4, missing, something]","[looking, 1, 4, choices, replace, 99gs400, ive, read, everything, forums, seen, much, conflicting, info4, optionslexus, ls430, customluxury, packageinfiniti, q45, premium, pckgaudi, audi, 42, loaded, audi, audi, stripped, loadedchicago, wintershad, blizzaks, gsworked, fine, 95q, pig, snow, im, sure, 2002q, world, away, ive, heard, audi, qc, problems, like, maintenance, included, im, happy, lexus, driver, q, appears, offer, lease, economy, would, think, could, command, attractive, package, go, wrong, 4, missing, something]","[looking, 1, 4, choices, replace, 99gs400, ive, read, everything, forums, seen, much, conflicting, info4, optionslexus, ls430, customluxury, packageinfiniti, q45, premium, pckgaudi, audi, 42, loaded, audi, stripped, loadedchicago, wintershad, blizzaks, gsworked, fine, 95q, pig, snow, im, sure, 2002q, world, away, ive, heard, audi, qc, problems, like, maintenance, included, im, happy, lexus, driver, q, appears, offer, lease, economy, would, think, could, command, attractive, package, go, wrong, 4, missing, something]"
4,2001-10-01,mrdetailer,"Generally a 3-4 year old car range is a good one because they are basically sound. If possible get one that is still under warrenty. But if not, make sure that a trusted Maxima mechanic goes over it thoroughly. Their V-6 is one of the best rated for over 10 years. Personally I am amazed at the great strides in American Quality recently, so the Olds may not be a bad idea other.",1118,Member,"[generally, 34, year, old, car, range, good, one, basically, sound, possible, get, one, still, warrenty, make, sure, trusted, maxima, mechanic, goes, thoroughly, v6, one, best, rated, 10, years, personally, amazed, great, strides, american, quality, recently, olds, may, bad, idea]","[generally, 34, year, old, car, range, good, one, basically, sound, possible, get, one, still, warrenty, make, sure, trusted, nissan, mechanic, goes, thoroughly, v6, one, best, rated, 10, years, personally, amazed, great, strides, american, quality, recently, olds, may, bad, idea]","[generally, 34, year, old, car, range, good, one, basically, sound, possible, get, one, still, warrenty, make, sure, trusted, nissan, mechanic, goes, thoroughly, v6, one, best, rated, 10, years, personally, amazed, great, strides, american, quality, recently, olds, may, bad, idea]"
5,2001-10-01,dindak,Maxima and Intrigue are both excellent choices for midsize sedans.,6632,Member,"[maxima, intrigue, excellent, choices, midsize, sedans]","[nissan, intrigue, excellent, choices, midsize, sedans]","[nissan, intrigue, excellent, choices, midsize, sedans]"
6,2001-10-01,canadiancl,"If I had that kind of budget, I would go for the A6 4.2 - - best combination of sport and luxury amongst the 4 choices.",1078,Member,"[kind, budget, would, go, a6, 42, best, combination, sport, luxury, amongst, 4, choices]","[kind, budget, would, go, audi, 42, best, combination, sport, luxury, amongst, 4, choices]","[kind, budget, would, go, audi, 42, best, combination, sport, luxury, amongst, 4, choices]"
7,2001-10-01,qbrozen,"Actually, I was about to agree with the A6, but I just looked at the specs here on Edmunds. Both the A6 and A8 are 4000 lbs, and considering the A8 gets a bump in HP over the A6, I don't think performance difference is an issue. So, considering that and the fact that the A6 would possibly be to small for you based on the rest of your list, I say go for the A8. I think you'll find the AWD to be the best addition to a car you could ever have hoped for.",30400,Member,"[actually, agree, a6, looked, specs, edmunds, a6, a8, 4000, lbs, considering, a8, gets, bump, hp, a6, dont, think, performance, difference, issue, considering, fact, a6, would, possibly, small, based, rest, list, say, go, a8, think, youll, find, awd, best, addition, car, could, ever, hoped]","[actually, agree, audi, looked, specs, edmunds, audi, audi, 4000, lbs, considering, audi, gets, bump, hp, audi, dont, think, performance, difference, issue, considering, fact, audi, would, possibly, small, based, rest, list, say, go, audi, think, youll, find, awd, best, addition, car, could, ever, hoped]","[actually, agree, audi, looked, specs, edmunds, audi, 4000, lbs, considering, audi, gets, bump, hp, audi, dont, think, performance, difference, issue, considering, fact, audi, would, possibly, small, based, rest, list, say, go, audi, think, youll, find, awd, best, addition, car, could, ever, hoped]"
8,2001-10-01,black_tulip,I'd pass Intrigue only for one reason: crash test results are terrible...,438,Member,"[id, pass, intrigue, one, reason, crash, test, results, terrible]","[id, pass, intrigue, one, reason, crash, test, results, terrible]","[id, pass, intrigue, one, reason, crash, test, results, terrible]"
9,2001-10-01,paulo3,I would recommend the Honda Accord or Toyota Camry as my choices for a sedan. The new Nissan Altima looks impressive but I would wait until the reliability of the 2002 is proven.The new BMW 525 is another nice car and is rated highly by many auto publications.,113,Member,"[would, recommend, honda, accord, toyota, camry, choices, sedan, new, nissan, altima, looks, impressive, would, wait, reliability, 2002, proventhe, new, bmw, 525, another, nice, car, rated, highly, many, auto, publications]","[would, recommend, honda, honda, toyota, toyota, choices, sedan, new, nissan, nissan, looks, impressive, would, wait, reliability, 2002, proventhe, new, bmw, 525, another, nice, car, rated, highly, many, auto, publications]","[would, recommend, honda, toyota, choices, sedan, new, nissan, looks, impressive, would, wait, reliability, 2002, proventhe, new, bmw, 525, another, nice, car, rated, highly, many, auto, publications]"


In [18]:
#Removing duplicate brands in lists but only

## Link attributes to models

In [19]:
# first lets count brand mentions in each record

In [20]:
messages = df.Message_words_v3.to_list()

In [21]:
brands_in_message = []
for m in messages:
    
    brands_mentioned = []
    for brand in brands_list:
        if len(np.where(m == brand)[0]) >0:
            brands_mentioned.append(brand)
            
    brands_in_message.append(brands_mentioned)
        
    

  if len(np.where(m == brand)[0]) >0:


In [22]:
df['Brands_in_message'] = brands_in_message

In [23]:
df['Qty_brands'] = df.Brands_in_message.apply(lambda x: len(x))

In [24]:
df['Qty_brands'].value_counts()

# Most messages discuss a single brand
# Also common to compare 2 brands
# 727 messages don't mention any brand (remove them from analysis?)
# Passing a certain treshold (5+), I think it would be better to remove comments since it will be hard to figure out which
# attribute belongs to each brand

0     1291
1     1215
2      755
3      459
4      219
5      109
6       54
7       24
8       10
9        6
11       4
10       3
16       1
13       1
Name: Qty_brands, dtype: int64

In [25]:
df.loc[df.Qty_brands == 0, 'Message_words_v3'].head(10) # to review records with no brand mentions
# most of them are for seeking advice

0                                                                                       [need, help, choosing, next, vehicle, tell, us, price, range, thinking, new, used, buying, leasing, features, musthaves, thoughts, mind, let, us, give, hand, tell, us, criteria, specifically, canyoull, find, lots, helpful, folks, give, useful, suggestions]
2     [buy, 2001, 2002, oldsmobile, intrigue, one, best, sedans, market, great, handling, one, best, v6, engines, around, 35l, dohc, producing, 215hp, deals, great, year, last, intrigue, warrany, extended, 5, years, car, recommended, magazines, including, consumer, reports, unfortunately, looked, consumers, sleeper, sleepers, market, opinion]
8                                                                                                                                                                                                                                                                                      [id, pass, intrigue, one, reaso

In [26]:
m = df.Message_words_v3[9] #record 9 is a good exaple
brand_dic = {}
for brand in brands_list:
    ix = np.where(m == brand)[0]
    if len(ix) > 0:
        brand_dic[brand] = ix[0]
        

brand_dic = dict(sorted(brand_dic.items(), key=lambda x:x[1]))

brand_list = list(brand_dic.values()) + [len(m)]

for i, tup in enumerate(brand_dic):
    print(brand_dic[tup])
    print(tup)
    
    if i == 0:
        
        print(m[0:brand_list[1]])
    else:
        print(m[brand_list[i]:brand_list[i+1]])
        
        

2
honda
['would' 'recommend' 'honda']
3
toyota
['toyota' 'choices' 'sedan' 'new']
7
nissan
['nissan' 'looks' 'impressive' 'would' 'wait' 'reliability' '2002'
 'proventhe' 'new']
16
bmw
['bmw' '525' 'another' 'nice' 'car' 'rated' 'highly' 'many' 'auto'
 'publications']


In [27]:
# now we run it for all records

brand_reviews = []

for m in df.Message_words_v3:

    brand_dic = {}
    for brand in brands_list:
        ix = np.where(m == brand)[0]
        if len(ix) > 0:
            brand_dic[brand] = ix[0]

    brand_dic = dict(sorted(brand_dic.items(), key=lambda x:x[1]))

    brand_list = list(brand_dic.values()) + [len(m)]


    review={}
    for i, tup in enumerate(brand_dic):
        #print(brand_dic[tup])
        #print(tup)

        if i == 0:
            
            review[tup] = m[0:brand_list[1]]
        else:
            review[tup] = m[brand_list[i]:brand_list[i+1]]
            
    brand_reviews.append(review)

# generates a list (1 entry per row) of dictionaries {brand: part of text corresponding}

  ix = np.where(m == brand)[0]


In [28]:
len(brand_reviews) # each record becomes a single dictionary with all its brand mentions and linked words

4151

In [29]:
brand_reviews[9]

{'honda': array(['would', 'recommend', 'honda'], dtype='<U12'),
 'toyota': array(['toyota', 'choices', 'sedan', 'new'], dtype='<U12'),
 'nissan': array(['nissan', 'looks', 'impressive', 'would', 'wait', 'reliability',
        '2002', 'proventhe', 'new'], dtype='<U12'),
 'bmw': array(['bmw', '525', 'another', 'nice', 'car', 'rated', 'highly', 'many',
        'auto', 'publications'], dtype='<U12')}

In [30]:
# now we need to concatenate all reviews for each brand into a single key in a dictionary
review_accum = {i:[] for i in brands_list}

In [31]:
for brand in brands_list:
    for review in brand_reviews:
        for single_review in review:
            #print(review[single_review])
            if single_review==brand:
                review_accum[brand].extend(review[single_review])

In [32]:
## from here, we are ready to start counting attributes per brand
## we also need to consider replacing some similar attributes before doing the final count 
## I will do one example with Audi but it could become a function and run in a loop to all the brands 

In [33]:
# first lets discover popular attributes regardless of brands

all_attributes = []

for i in list(review_accum.values()):
    all_attributes.extend(i)
    
words = pd.Series(all_attributes)

words.value_counts().head(40) # this shows the frequencies of each word
# the most "rustic" approach would be to literally print them in order and take note of the ones that can be considered "attributes"

car           3152
honda         1746
like          1634
would         1600
get           1453
new           1394
toyota        1341
cars          1324
good          1249
one           1178
also           964
nissan         963
im             943
dont           883
drive          880
miles          823
much           808
want           802
better         796
years          776
used           766
think          750
volkswagen     738
price          737
looking        714
well           699
really         669
buy            583
know           579
go             560
even           560
chevrolet      556
seat           551
need           550
great          533
year           532
could          529
ive            495
driving        487
might          481
dtype: int64

In [34]:
words[~words.isin(brands_list)].value_counts().head(60)


car            3152
like           1634
would          1600
get            1453
new            1394
cars           1324
good           1249
one            1178
also            964
im              943
dont            883
drive           880
miles           823
much            808
want            802
better          796
years           776
used            766
think           750
price           737
looking         714
well            699
really          669
buy             583
know            579
even            560
go              560
need            550
great           533
year            532
could           529
ive             495
driving         487
might           481
time            479
look            466
lot             456
best            454
engine          454
something       446
3               445
reliability     442
id              441
nice            439
going           434
vehicle         433
civic           431
old             424
probably        419
still           409


In [35]:
words_count = words.value_counts()
words_nobrands_count = words[~words.isin(brands_list)].value_counts()

### Stemming

In [36]:
#Stemming
stemmer = PorterStemmer()

In [37]:
def stem_list (text):
    return pd.Series([stemmer.stem(w) for w in text])

In [38]:
stemmed_words = stem_list(words)
len(stemmed_words)

191416

In [39]:
stemmed_words.value_counts().head(60)

car           4477
like          1924
get           1854
honda         1746
would         1600
look          1585
drive         1477
new           1394
toyota        1341
year          1308
one           1270
good          1251
use           1023
go             994
want           993
also           964
nissan         963
im             943
price          925
mile           900
think          890
dont           883
buy            848
much           808
reliabl        806
better         799
volkswagen     741
need           738
well           699
vehicl         690
realli         669
know           635
model          633
time           589
seat           581
make           581
lot            578
even           562
chevrolet      558
engin          557
great          533
could          529
dealer         506
ive            495
consid         485
might          481
sedan          463
nice           459
civic          459
best           455
someth         446
3              445
audi        

In [40]:
stemmed_words[~stemmed_words.isin(brands_list)].value_counts().head(60)


car        4477
like       1924
get        1854
would      1600
look       1585
drive      1477
new        1394
year       1308
one        1270
good       1251
use        1023
go          994
want        993
also        964
im          943
price       925
mile        900
think       890
dont        883
buy         848
much        808
reliabl     806
better      799
need        738
well        699
vehicl      690
realli      669
know        635
model       633
time        589
make        581
lot         578
even        562
engin       557
great       533
could       529
dealer      506
ive         495
consid      485
might       481
sedan       463
nice        459
civic       459
best        455
someth      446
3           445
id          442
leas        442
old         441
find        422
option      422
probabl     420
test        419
seem        415
back        409
4           409
still       409
cost        403
less        402
around      396
dtype: int64

In [41]:
stemwords_count = stemmed_words.value_counts()
stemwords_nobrands_count = stemmed_words[~stemmed_words.isin(brands_list)].value_counts()

In [42]:
#Export value count series to a single multi-sheet Excel
#pip install xlsxwriter
writer = pd.ExcelWriter('attribute_count.xlsx', engine='xlsxwriter')


In [43]:
words_count.to_excel(writer, sheet_name='words_valuecounts')
words_nobrands_count.to_excel(writer, sheet_name='words_nobrands_valuecounts')
stemwords_count.to_excel(writer, sheet_name='stemwords_valuecounts')
stemwords_nobrands_count.to_excel(writer, sheet_name='stemwords_nobrands_valuecounts')

In [44]:
writer.save()


### TF-IDF

In [53]:
df

Unnamed: 0,Date,User_Id,Message,NumberOfPastPosts,Role,Message_words,Message_words_v2,Message_words_v3,Brands_in_message,Qty_brands
0,2001-09-01,pat,"Need help choosing your next vehicle? Tell us your price range, are you thinking New or Used, buying or leasing, what features are must-haves, what other thoughts are on your mind, and let us give you a hand! Tell us your criteria as specifically as you can.You'll find lots of helpful folks here who can give you useful suggestions.",10421,Member,"[need, help, choosing, next, vehicle, tell, us, price, range, thinking, new, used, buying, leasing, features, musthaves, thoughts, mind, let, us, give, hand, tell, us, criteria, specifically, canyoull, find, lots, helpful, folks, give, useful, suggestions]","[need, help, choosing, next, vehicle, tell, us, price, range, thinking, new, used, buying, leasing, features, musthaves, thoughts, mind, let, us, give, hand, tell, us, criteria, specifically, canyoull, find, lots, helpful, folks, give, useful, suggestions]","[need, help, choosing, next, vehicle, tell, us, price, range, thinking, new, used, buying, leasing, features, musthaves, thoughts, mind, let, us, give, hand, tell, us, criteria, specifically, canyoull, find, lots, helpful, folks, give, useful, suggestions]",[],0
1,2001-09-01,willow25,"I'm buying a ""new"" car by the end of the year and my first choice is a Nissan Maxima. I've read that they are very reliable and hold their value well. My budget is probably going to allow for a 98-99 and I was wondering if it's worth buying a car that's almost 3-4 years old. Any suggestions or comments would help a lot.willow25",2,Member,"[im, buying, new, car, end, year, first, choice, nissan, maxima, ive, read, reliable, hold, value, well, budget, probably, going, allow, 9899, wondering, worth, buying, car, thats, almost, 34, years, old, suggestions, comments, would, help, lotwillow25]","[im, buying, new, car, end, year, first, choice, nissan, nissan, ive, read, reliable, hold, value, well, budget, probably, going, allow, 9899, wondering, worth, buying, car, thats, almost, 34, years, old, suggestions, comments, would, help, lotwillow25]","[im, buying, new, car, end, year, first, choice, nissan, ive, read, reliable, hold, value, well, budget, probably, going, allow, 9899, wondering, worth, buying, car, thats, almost, 34, years, old, suggestions, comments, would, help, lotwillow25]",[nissan],1
2,2001-10-01,dindak,"Buy a 2001/ 2002 Oldsmobile Intrigue. It's one of the best sedans on the market with great handling and one of the best V6 engines around, a 3.5L DOHC producing 215HP. The deals should be great this year as it's the last on for Intrigue and the warrany on them has been extended to 5 years. The car is recommended by most magazines (including Consumer Reports), but is unfortunately over looked by most consumers. It is the sleeper of all sleepers on the market in my opinion.",6632,Member,"[buy, 2001, 2002, oldsmobile, intrigue, one, best, sedans, market, great, handling, one, best, v6, engines, around, 35l, dohc, producing, 215hp, deals, great, year, last, intrigue, warrany, extended, 5, years, car, recommended, magazines, including, consumer, reports, unfortunately, looked, consumers, sleeper, sleepers, market, opinion]","[buy, 2001, 2002, oldsmobile, intrigue, one, best, sedans, market, great, handling, one, best, v6, engines, around, 35l, dohc, producing, 215hp, deals, great, year, last, intrigue, warrany, extended, 5, years, car, recommended, magazines, including, consumer, reports, unfortunately, looked, consumers, sleeper, sleepers, market, opinion]","[buy, 2001, 2002, oldsmobile, intrigue, one, best, sedans, market, great, handling, one, best, v6, engines, around, 35l, dohc, producing, 215hp, deals, great, year, last, intrigue, warrany, extended, 5, years, car, recommended, magazines, including, consumer, reports, unfortunately, looked, consumers, sleeper, sleepers, market, opinion]",[],0
3,2001-10-01,peteri1,"Am looking at 1 of 4 choices to replace my 99GS400. I've read everything here and on other forums and have seen much conflicting info:4 options:Lexus LS430 with custom/luxury packageInfiniti Q45 with premium pckgAudi A-6 4.2 loaded Audi A-8 not stripped, but not loadedChicago winters--had Blizzaks on the GS--worked fine. My 95Q was pig in snow, but I'm sure 2002Q is a world away from that. I've heard Audi has some QC problems, but I like their maintenance being included. I'm a happy Lexus driver. The Q appears to offer the most for the $$. Will lease and in this economy, I would think I could command an attractive package. Can I go wrong with any of the 4? Am I missing something?",21,Member,"[looking, 1, 4, choices, replace, 99gs400, ive, read, everything, forums, seen, much, conflicting, info4, optionslexus, ls430, customluxury, packageinfiniti, q45, premium, pckgaudi, a6, 42, loaded, audi, a8, stripped, loadedchicago, wintershad, blizzaks, gsworked, fine, 95q, pig, snow, im, sure, 2002q, world, away, ive, heard, audi, qc, problems, like, maintenance, included, im, happy, lexus, driver, q, appears, offer, lease, economy, would, think, could, command, attractive, package, go, wrong, 4, missing, something]","[looking, 1, 4, choices, replace, 99gs400, ive, read, everything, forums, seen, much, conflicting, info4, optionslexus, ls430, customluxury, packageinfiniti, q45, premium, pckgaudi, audi, 42, loaded, audi, audi, stripped, loadedchicago, wintershad, blizzaks, gsworked, fine, 95q, pig, snow, im, sure, 2002q, world, away, ive, heard, audi, qc, problems, like, maintenance, included, im, happy, lexus, driver, q, appears, offer, lease, economy, would, think, could, command, attractive, package, go, wrong, 4, missing, something]","[looking, 1, 4, choices, replace, 99gs400, ive, read, everything, forums, seen, much, conflicting, info4, optionslexus, ls430, customluxury, packageinfiniti, q45, premium, pckgaudi, audi, 42, loaded, audi, stripped, loadedchicago, wintershad, blizzaks, gsworked, fine, 95q, pig, snow, im, sure, 2002q, world, away, ive, heard, audi, qc, problems, like, maintenance, included, im, happy, lexus, driver, q, appears, offer, lease, economy, would, think, could, command, attractive, package, go, wrong, 4, missing, something]",[audi],1
4,2001-10-01,mrdetailer,"Generally a 3-4 year old car range is a good one because they are basically sound. If possible get one that is still under warrenty. But if not, make sure that a trusted Maxima mechanic goes over it thoroughly. Their V-6 is one of the best rated for over 10 years. Personally I am amazed at the great strides in American Quality recently, so the Olds may not be a bad idea other.",1118,Member,"[generally, 34, year, old, car, range, good, one, basically, sound, possible, get, one, still, warrenty, make, sure, trusted, maxima, mechanic, goes, thoroughly, v6, one, best, rated, 10, years, personally, amazed, great, strides, american, quality, recently, olds, may, bad, idea]","[generally, 34, year, old, car, range, good, one, basically, sound, possible, get, one, still, warrenty, make, sure, trusted, nissan, mechanic, goes, thoroughly, v6, one, best, rated, 10, years, personally, amazed, great, strides, american, quality, recently, olds, may, bad, idea]","[generally, 34, year, old, car, range, good, one, basically, sound, possible, get, one, still, warrenty, make, sure, trusted, nissan, mechanic, goes, thoroughly, v6, one, best, rated, 10, years, personally, amazed, great, strides, american, quality, recently, olds, may, bad, idea]",[nissan],1
...,...,...,...,...,...,...,...,...,...,...
4146,2021-02-01,kyfdx,"qbrozen said: show previous quoteskyfdx said:I'll guess they survey more Accord owners than all Jaguar models combined. How are you ever going to get meaningful data from a make that sells so few cars?Name one other organization that actually compiles data or surveys, long term. I don't think there are any. \nOh, you can't. But that's my point when folks throw old thinking around like ""Honda and Toyota are the only reliable cars."" It just isn't that simple. I would say HyunKia are just as reliable these days. Do I have proof? Nope. But nobody has proof to say otherwise. That's all I'm getting at. \nI bet CR has a lot of surveys on those two makes. You might be right, and they could probably back it up.",203030,Moderator,"[qbrozen, said, show, previous, quoteskyfdx, saidill, guess, survey, accord, owners, jaguar, models, combined, ever, going, get, meaningful, data, make, sells, carsname, one, organization, actually, compiles, data, surveys, long, term, dont, think, oh, cant, thats, point, folks, throw, old, thinking, around, like, honda, toyota, reliable, cars, isnt, simple, would, say, hyunkia, reliable, days, proof, nope, nobody, proof, say, otherwise, thats, im, getting, bet, cr, lot, surveys, two, makes, might, right, could, probably, back]","[qbrozen, said, show, previous, quoteskyfdx, saidill, guess, survey, honda, owners, jaguar, models, combined, ever, going, get, meaningful, data, make, sells, carsname, one, organization, actually, compiles, data, surveys, long, term, dont, think, oh, cant, thats, point, folks, throw, old, thinking, around, like, honda, toyota, reliable, cars, isnt, simple, would, say, hyunkia, reliable, days, proof, nope, nobody, proof, say, otherwise, thats, im, getting, bet, cr, lot, surveys, two, makes, might, right, could, probably, back]","[qbrozen, said, show, previous, quoteskyfdx, saidill, guess, survey, honda, owners, jaguar, models, combined, ever, going, get, meaningful, data, make, sells, carsname, one, organization, actually, compiles, data, surveys, long, term, dont, think, oh, cant, thats, point, folks, throw, old, thinking, around, like, honda, toyota, reliable, cars, isnt, simple, would, say, hyunkia, reliable, days, proof, nope, nobody, proof, say, otherwise, thats, im, getting, bet, cr, lot, surveys, two, makes, might, right, could, probably, back]","[honda, toyota]",2
4147,2021-02-01,backy,"KamCottage said:Thank you backy. Since I first posted this question, I've come to realize, as mcdawgg said above, that I want a Toyota or Honda because of the reliability. I am just so damn used to longevity in all my cars that I think anything less would make me unhappy in the long run. I don't have the patience or money to repair stuff anymore than I have to... and since I have driven beaters into the ground, I don't care about aesthetics as much as durability. That said if no Toyota or Honda is a good fit for the husband than we will look at an Optima CPO. We're heading to Carmax today to just get in a bunch of cars and see what kind of room there is in the front driver seat for him, 6 foot 4, and me who will be the main driver. \nDid you get a car yet? If not and are still considering the Optima, JD Power just ranked the 2018 Optima the #1 mid-sized car in its latest Vehicle Dependability Study. And Kia was the highest-ranked mass-market brand overall, behind only Lexus and Porsche. FYI the 2018 Sonata was the #2 mid-sized car.https://www.jdpower.com/business/press-releases/2021-us-vehicle-dependability-study-vds",18946,Member,"[kamcottage, saidthank, backy, since, first, posted, question, ive, come, realize, mcdawgg, said, want, toyota, honda, reliability, damn, used, longevity, cars, think, anything, less, would, make, unhappy, long, run, dont, patience, money, repair, stuff, anymore, since, driven, beaters, ground, dont, care, aesthetics, much, durability, said, toyota, honda, good, fit, husband, look, optima, cpo, heading, carmax, today, get, bunch, cars, see, kind, room, front, driver, seat, 6, foot, 4, main, driver, get, car, yet, still, considering, optima, jd, power, ranked, 2018, optima, 1, midsized, car, latest, vehicle, dependability, study, kia, highestranked, massmarket, brand, overall, behind, lexus, porsche, fyi, 2018, sonata, 2, midsized, ...]","[kamcottage, saidthank, backy, since, first, posted, question, ive, come, realize, mcdawgg, said, want, toyota, honda, reliability, damn, used, longevity, cars, think, anything, less, would, make, unhappy, long, run, dont, patience, money, repair, stuff, anymore, since, driven, beaters, ground, dont, care, aesthetics, much, durability, said, toyota, honda, good, fit, husband, look, kia, cpo, heading, carmax, today, get, bunch, cars, see, kind, room, front, driver, seat, 6, foot, 4, main, driver, get, car, yet, still, considering, kia, jd, power, ranked, 2018, kia, 1, midsized, car, latest, vehicle, dependability, study, kia, highestranked, massmarket, brand, overall, behind, lexus, porsche, fyi, 2018, sonata, 2, midsized, ...]","[kamcottage, saidthank, backy, since, first, posted, question, ive, come, realize, mcdawgg, said, want, toyota, honda, reliability, damn, used, longevity, cars, think, anything, less, would, make, unhappy, long, run, dont, patience, money, repair, stuff, anymore, since, driven, beaters, ground, dont, care, aesthetics, much, durability, said, toyota, honda, good, fit, husband, look, kia, cpo, heading, carmax, today, get, bunch, cars, see, kind, room, front, driver, seat, 6, foot, 4, main, driver, get, car, yet, still, considering, kia, jd, power, ranked, 2018, kia, 1, midsized, car, latest, vehicle, dependability, study, kia, highestranked, massmarket, brand, overall, behind, lexus, porsche, fyi, 2018, sonata, 2, midsized, ...]","[honda, kia, seat, toyota]",4
4148,2021-02-01,RayeEliza,"I am doing major research in trying to find my new car, which I am more leaning towards an SUV. Since I recently moved to Washington state and having to deal with snow, I will need a car that I can drive in snow without issues. I am also looking for somewhat good mileage and really good cargo space. I have only three cars I'm looking for that I am interested in which are the Subaru Outback, Toyota RAV4 hybrid, and the Subaru Forester. Which of these would you all recommend or if there are any others that sound just as good. The optimal price range is below $35,000.",6,Member,"[major, research, trying, find, new, car, leaning, towards, suv, since, recently, moved, washington, state, deal, snow, need, car, drive, snow, without, issues, also, looking, somewhat, good, mileage, really, good, cargo, space, three, cars, im, looking, interested, subaru, outback, toyota, rav4, hybrid, subaru, forester, would, recommend, others, sound, good, optimal, price, range, 35000]","[major, research, trying, find, new, car, leaning, towards, suv, since, recently, moved, washington, state, deal, snow, need, car, drive, snow, without, issues, also, looking, somewhat, good, mileage, really, good, cargo, space, three, cars, im, looking, interested, subaru, outback, toyota, rav4, hybrid, subaru, forester, would, recommend, others, sound, good, optimal, price, range, 35000]","[major, research, trying, find, new, car, leaning, towards, suv, since, recently, moved, washington, state, deal, snow, need, car, drive, snow, without, issues, also, looking, somewhat, good, mileage, really, good, cargo, space, three, cars, im, looking, interested, subaru, outback, toyota, rav4, hybrid, subaru, forester, would, recommend, others, sound, good, optimal, price, range, 35000]","[subaru, toyota]",2
4149,2021-02-01,mlevine,I would lean toward subaru. Need to see which model you are comfortable in. Mazda SUVs and honda SUVs also not a bad choice.,512,Member,"[would, lean, toward, subaru, need, see, model, comfortable, mazda, suvs, honda, suvs, also, bad, choice]","[would, lean, toward, subaru, need, see, model, comfortable, mazda, suvs, honda, suvs, also, bad, choice]","[would, lean, toward, subaru, need, see, model, comfortable, mazda, suvs, honda, suvs, also, bad, choice]","[honda, mazda, subaru]",3


In [61]:
def listtostring(text):
    return " ".join(text) 

In [63]:
df["Message_words_v4"] = df["Message_words_v3"].apply(listtostring)

In [66]:
vectorizer = TfidfVectorizer()

In [69]:
X = vectorizer.fit_transform(df["Message_words_v4"].tolist())

In [71]:
df_tfidf = pd.DataFrame(X[0].T.todense(), index=vectorizer.get_feature_names_out(), columns=["TF-IDF"])
df_tfidf = df_tfidf.sort_values('TF-IDF', ascending=False)

In [73]:
df_tfidf.to_csv("attribute_tfidf.csv")

In [45]:
# once we have the list of the 5 most popular attributes, my guess is we have to calculate the uplift scores of said attribute
# against the 5 top brands found in letter A). Then the uplift should tell us which brand is more related to which attribute

In [46]:
# let's say "reliability" is one of the popular attributes 
# and the top 5 brands are: honda, nissan, audi, bwm, toyota
# let's get the inputs necessary to calculate uplift of reliability in nissan

In [47]:
n=0
for i in  df.Brands_in_message:
    if 'nissan' in i:
        n += 1
n
#nissan appears in 525 rows 

525

In [48]:
df_nissan = pd.DataFrame(review_accum['nissan'], columns = ['word'])
(df_nissan.word == 'reliability').sum() # 73 mentions of "relibaility" in honda

# and 28 times the term "reliability" was mentioned alongside a nissan

28

In [49]:
# how about honda?

n=0
for i in  df.Brands_in_message:
    if 'honda' in i:
        n += 1
print(n)


df_honda = pd.DataFrame(review_accum['honda'], columns = ['word'])
(df_honda.word == 'reliability').sum() # 73 mentions of "relibaility" in honda


924


73

In [50]:
73/924 # not the exact uplift formula but a quick validation

0.07900432900432901

In [51]:
28/525 # vs nissan

0.05333333333333334

In [52]:
# honda has more prevalence as a reliable brand than nissan (pending to adjust for uplift real formula)

to consider:

right now the code counts all the appearances of an attribute, even if they come from the same record; I think that is not the exact definition of the uplift formula. I don't think it will change the numbers very much, but would be good to refine it if we have the time