In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', None) # to read records completely
import pickle
import numpy as np
import itertools
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

## Previous data prep necessary for c) and d)

In [2]:
# Load the message table from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load files produced by a trusted step.
df = pd.read_pickle('data_clean.pkl')

In [3]:
# Brand/model lookup table. The CSV is read without a header row, so the two column
# names are supplied here; punctuation is then stripped from the brand column
# (e.g. "nissan." becomes "nissan").
models = (
    pd.read_csv('models.csv', header=None, names=['brand', 'model'])
    .assign(brand=lambda frame: frame['brand'].str.replace(r'[^\w\s]+', '', regex=True))
)

In [4]:
# Drop lookup rows whose brand value is one of these generic words
# (presumably not real brands - confirm against models.csv).
models = models[~models.brand.isin(['car', 'sedan', 'problem'])]

In [5]:
df.Message.str.contains('said').sum()

370

In [6]:
# Lower-case every token of every message so later equality checks are case-insensitive.
df['Message_words'] = df['Message_words'].map(
    lambda tokens: [token.lower() for token in tokens]
)

### Replace models with brands

In [7]:
# One list of lower-cased tokens per record, in DataFrame row order.
messages = df.Message_words.to_list()

In [None]:
# Replace every model name found in a message with its brand, using numpy.where.
# The (model, brand) pairs are extracted once instead of calling iterrows() per message.
model_brand_pairs = list(zip(models['model'], models['brand']))

messages2 = []
for m in messages:
    # Convert the token list to a string array up front. Comparing a plain list with a
    # string evaluates to a single False, so without this the first (model, brand) pair
    # was never applied to any message.
    m = np.asarray(m, dtype=str)

    # Pairs are applied in file order, exactly like the row-by-row scan they replace.
    for model_name, brand_name in model_brand_pairs:
        m = np.where(m == model_name, brand_name, m)

    messages2.append(m)

df['Message_words_v2'] = messages2

In [None]:
# Count how many tokens were changed by the model -> brand replacement.
m1 = df.Message_words.to_list()
m2 = df.Message_words_v2.to_list()

c = 0  # tokens that differ between the two versions
W = 0  # total tokens inspected
for original_tokens, replaced_tokens in zip(m1, m2):
    for j, replaced in enumerate(replaced_tokens):
        # position-by-position comparison of the same record
        if original_tokens[j] != replaced:
            c += 1
        W += 1

In [None]:
print(f'{c} changes of models')

In [None]:
df.head(10)

In [None]:
# Unique brand names, in order of first appearance in the lookup table.
brands_list = models['brand'].drop_duplicates().tolist()

In [None]:
# Collapse runs of repeated tokens while keeping the original order.
def remove_consecutive_duplicate (text):
    """Return a numpy array of ``text`` with each run of equal adjacent items reduced to one.

    Only *consecutive* repeats are collapsed; an item that reappears later, after a
    different item, is kept.
    """
    collapsed = [key for key, _run in itertools.groupby(text)]
    return np.array(collapsed)

In [None]:
# v3 = v2 with runs of identical adjacent tokens collapsed to a single token.
df["Message_words_v3"] = df["Message_words_v2"].apply(remove_consecutive_duplicate)

In [None]:
type(df["Message_words_v2"][0])

In [None]:
type(df["Message_words_v3"][0])

In [None]:
df.head(10)

In [None]:
#Removing duplicate brands in lists but only

## Link attributes to models

In [None]:
# first lets count brand mentions in each record

In [None]:
# NOTE: `messages` is re-bound here; from this point it holds the v3 token arrays,
# not the original lower-cased lists.
messages = df.Message_words_v3.to_list()

In [None]:
# For each message, list the brands (in brands_list order) that occur at least once.
brands_in_message = [
    [brand for brand in brands_list if np.where(m == brand)[0].size > 0]
    for m in messages
]
        
    

In [None]:
# Store the per-record list of mentioned brands alongside the messages.
df['Brands_in_message'] = brands_in_message

In [None]:
# Number of distinct brands mentioned in each record.
df['Qty_brands'] = df['Brands_in_message'].apply(len)

In [None]:
df['Qty_brands'].value_counts()

# Most messages discuss a single brand.
# It is also common to compare 2 brands.
# 727 messages don't mention any brand (remove them from the analysis?).
# Past a certain threshold (5+ brands), it would probably be better to remove the comments,
# since it becomes hard to figure out which attribute belongs to which brand.

In [None]:
df.loc[df.Qty_brands == 0, 'Message_words_v3'].head(10) # to review records with no brand mentions
# most of them are for seeking advice

In [None]:
# Worked example on one record (label 9): split the message into one segment per brand.
m = df.Message_words_v3[9]

# Position of the first occurrence of every brand present in the message.
brand_dic = {}
for brand in brands_list:
    hits = np.where(m == brand)[0]
    if hits.size > 0:
        brand_dic[brand] = hits[0]

# Order the brands by where they first appear.
brand_dic = dict(sorted(brand_dic.items(), key=lambda item: item[1]))

# Segment boundaries: every first-occurrence position, plus the end of the message.
brand_list = [*brand_dic.values(), len(m)]

for i, (name, position) in enumerate(brand_dic.items()):
    print(position)
    print(name)

    # The first brand also receives the words that come before it.
    start = 0 if i == 0 else brand_list[i]
    print(m[start:brand_list[i + 1]])
        
        

In [None]:
# Now run the segmentation for all records.

def split_message_by_brand(words, brands):
    """Split one tokenised message into per-brand segments.

    Parameters
    ----------
    words : sequence of str
        Tokens of a single message (a numpy array or a plain list).
    brands : iterable of str
        Brand names to look for.

    Returns
    -------
    dict
        ``{brand: tokens}`` for every brand found, ordered by first occurrence. A brand's
        segment runs from its first occurrence up to the first occurrence of the next
        brand (or the end of the message); the first brand's segment additionally
        includes every token that precedes it. Empty when no brand is present.
    """
    # Element-wise comparison needs an array; a plain list compared with a string is
    # just a single False.
    words = np.asarray(words, dtype=str)

    # First position of each brand that appears in the message.
    first_position = {}
    for brand in brands:
        hits = np.where(words == brand)[0]
        if len(hits) > 0:
            first_position[brand] = hits[0]

    ordered = sorted(first_position.items(), key=lambda item: item[1])
    boundaries = [position for _, position in ordered] + [len(words)]

    segments = {}
    for i, (brand, _) in enumerate(ordered):
        # Words before the first brand are attributed to that first brand.
        start = 0 if i == 0 else boundaries[i]
        segments[brand] = words[start:boundaries[i + 1]]
    return segments


brand_reviews = [split_message_by_brand(m, brands_list) for m in df.Message_words_v3]

# generates a list (1 entry per row) of dictionaries {brand: part of text corresponding}

In [None]:
len(brand_reviews) # each record becomes a single dictionary with all its brand mentions and linked words

In [None]:
brand_reviews[9]

In [None]:
# now we need to concatenate all reviews for each brand into a single key in a dictionary
review_accum = {i:[] for i in brands_list}

In [None]:
# Concatenate, per brand, the word segments from every record into a single list.
# The accumulator is rebuilt here so that re-running this cell does not append the
# same words a second time.
review_accum = {brand: [] for brand in brands_list}

# One pass over the records instead of re-scanning every record once per brand;
# the order of words within each brand's list is unchanged (record order).
for review in brand_reviews:
    for brand, segment in review.items():
        # Only brands present in brands_list are accumulated.
        if brand in review_accum:
            review_accum[brand].extend(segment)

In [None]:
## from here, we are ready to start counting attributes per brand
## we also need to consider replacing some similar attributes before doing the final count 
## I will do one example with Audi but it could become a function and run in a loop to all the brands 

In [None]:
# First, discover popular attributes regardless of brand:
# pool the accumulated words of every brand into one flat list.
all_attributes = list(itertools.chain.from_iterable(review_accum.values()))

words = pd.Series(all_attributes)

# Frequency of each word. The most "rustic" approach would be to literally read them in
# order and take note of the ones that can be considered "attributes".
words.value_counts().head(40)

In [None]:
words[~words.isin(brands_list)].value_counts().head(60)


In [None]:
# Word frequencies over all pooled words, and the same with brand names excluded.
words_count = words.value_counts()
words_nobrands_count = words[~words.isin(brands_list)].value_counts()

### Stemming

In [None]:
#Stemming
stemmer = PorterStemmer()

In [None]:
def stem_list (text):
    """Stem every word of ``text`` with the module-level ``stemmer``.

    Returns a new pandas Series (default integer index) holding the stems in input order.
    """
    stems = list(map(stemmer.stem, text))
    return pd.Series(stems)

In [None]:
stemmed_words = stem_list(words)
len(stemmed_words)

In [None]:
stemmed_words.value_counts().head(60)

In [None]:
stemmed_words[~stemmed_words.isin(brands_list)].value_counts().head(60)


In [None]:
# Frequencies of the stemmed words, with and without brand names.
stemwords_count = stemmed_words.value_counts()

# brands_list holds unstemmed names, so testing only the stems lets through any brand
# whose stem differs from its name (the stemmer may strip a trailing "e" or "s").
# A token is treated as a brand if either its original form or its stem is in brands_list.
# `words` and `stemmed_words` are aligned by position (stemmed_words = stem_list(words)).
brand_mask = words.isin(brands_list).to_numpy() | stemmed_words.isin(brands_list).to_numpy()
stemwords_nobrands_count = stemmed_words[~brand_mask].value_counts()

In [None]:
#Export value count series to a single multi-sheet Excel
#pip install xlsxwriter
writer = pd.ExcelWriter('attribute_count.xlsx', engine='xlsxwriter')


In [None]:
# Write each frequency table to its own sheet of the workbook.
sheets = {
    'words_valuecounts': words_count,
    'words_nobrands_valuecounts': words_nobrands_count,
    'stemwords_valuecounts': stemwords_count,
    'stemwords_nobrands_valuecounts': stemwords_nobrands_count,
}
for sheet_name, counts in sheets.items():
    counts.to_excel(writer, sheet_name=sheet_name)

In [None]:
# Flush the workbook to disk and release the file handle.
# ExcelWriter.save() is deprecated and no longer exists in pandas 2.x; close() is the
# supported call and also writes the file.
writer.close()


### TF-IDF

In [None]:
df

In [None]:
# once we have the list of the 5 most popular attributes, my guess is we have to calculate the uplift scores of said attribute
# against the 5 top brands found in letter A). Then the uplift should tell us which brand is more related to which attribute

In [None]:
# let's say "reliability" is one of the popular attributes 
# and the top 5 brands are: honda, nissan, audi, bmw, toyota
# let's get the inputs necessary to calculate uplift of reliability in nissan

In [None]:
# Number of records whose brand list contains nissan.
n = sum('nissan' in brands for brands in df.Brands_in_message)
n
#nissan appears in 525 rows 

In [None]:
# One row per word accumulated for nissan.
df_nissan = pd.DataFrame(review_accum['nissan'], columns = ['word'])
(df_nissan.word == 'reliability').sum() # mentions of "reliability" among the words linked to nissan

# and 28 times the term "reliability" was mentioned alongside a nissan

In [None]:
# how about honda?

# Number of records whose brand list contains honda.
n = sum('honda' in brands for brands in df.Brands_in_message)
print(n)


# One row per word accumulated for honda.
df_honda = pd.DataFrame(review_accum['honda'], columns=['word'])
(df_honda['word'] == 'reliability').sum()  # 73 mentions of "reliability" in honda


In [None]:
73/924 # not the exact uplift formula but a quick validation

In [None]:
28/525 # vs nissan

In [None]:
# honda has more prevalence as a reliable brand than nissan (pending to adjust for uplift real formula)

to consider:

Right now the code counts every appearance of an attribute, even when several of them come from the same record; I think that is not the exact definition used by the uplift formula. I don't think it will change the numbers very much, but it would be good to refine it if we have the time.