In [None]:
import pandas as pd
pd.set_option('display.max_colwidth', 1000) # to read records completely
import pickle
import numpy as np
import itertools
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

## Import data from previous preprocessing steps

In [None]:
# Importing scraped and pre-processed data  
df = pd.read_pickle('data_clean.pkl')

In [None]:
models = pd.read_csv('models.csv', header=None, names = ['brand', 'model'])
models['brand'] = models['brand'].str.replace(r'[^\w\s]+', '', regex=True) # remove punctuation errors (e.g. "nissan.")
models = models[~models.brand.isin(['car', 'sedan', 'problem'])] # remove rows that are not brands

In [None]:
# Convert all message words to lowercase
df.Message_words = df.Message_words.apply(lambda x: [i.lower() for i in x])

### Replace models with brands

In [None]:
messages = df.Message_words.to_list()

In [None]:
# Replace every occurrence of a model name with its brand, using the
# (model, brand) rows of `models` in file order.
# The pairs are extracted once instead of re-running DataFrame.iterrows()
# for every single message.
model_brand_pairs = list(zip(models['model'], models['brand']))

messages2 = []
for m in messages:
    # Convert the token list to an array *before* the first comparison:
    # `list == str` evaluates to a single False, so without this the first
    # row of `models` would never be applied to any message.
    m = np.asarray(m, dtype=str)
    for model_name, brand_name in model_brand_pairs:
        m = np.where(m == model_name, brand_name, m)
    messages2.append(m)

df['Message_words_v2'] = messages2

In [None]:
# There exists occurences where the text says "nissan, nissan"
# Remove duplicates in lists while keeping order
def remove_consecutive_duplicate (text):
    """Collapse runs of identical adjacent items into a single item.

    Non-adjacent repeats are kept, so the original order of the sequence is
    preserved. Returns the collapsed sequence as a numpy array.
    """
    collapsed = [token for token, _run in itertools.groupby(text)]
    return np.array(collapsed)

df["Message_words_v3"] = df["Message_words_v2"].apply(remove_consecutive_duplicate)

## Link attributes to models

In [None]:
# Unique brand names taken from the `brand` column of the models table
brands_list = models.brand.drop_duplicates().to_list()

# Add additional brands not covered by the models table.
# NOTE(review): these are appended without a duplicate check, so a brand that is
# also present in `models` would appear twice in brands_list — confirm.
# NOTE(review): 'merzedesbenz' looks like a misspelling of "mercedesbenz" — check
# which spelling the tokenised messages actually contain before changing it.
brands_list.extend(['lexus', 'ferrari', 'merzedesbenz', 'tesla','gm', 'peugeot', 'jeep', 'bentley', 'fiat'])

In [None]:
messages = df.Message_words_v3.to_list()

# For every review, collect the brands (in brands_list order) that occur in it.
# np.any(m == brand) is True as soon as at least one token equals the brand.
brands_in_message = [
    [brand for brand in brands_list if np.any(m == brand)]
    for m in messages
]

# Preview a few entries only; displaying the full list floods the cell output
brands_in_message[:5]

In [None]:
# Store the brands found per message and how many there are
df['Brands_in_message'] = brands_in_message
df['Qty_brands'] = df['Brands_in_message'].map(len)
df['Qty_brands'].value_counts()

# 1291 messages don't mention any brand

We can assign attributes mentioned in the reviews, like the power of a car, to a specific brand. For example, in the sentence "I like the BMW for its power. On the other hand, the Honda is reliable" we would want to assign the attribute "power" to "BMW" and "reliable" to "Honda". We have created functions for two different approaches:
1) assign all words found between 1st brand mention and next brand mention to 1st brand

2) assign n words to each side of the brand mention to the brand. We found n = 4 to work best. 

In [None]:
# First approach: link all words found between brand mention and next brand mention...
# ...except for the first brand mention which receives words from the start of the message

m = df.Message_words_v3[9] # record 9 is a good example

def get_attr_in_the_right(m, brand_list):
    """Split a message into one segment per brand, cut at each brand's first mention.

    Parameters
    ----------
    m : sequence of str
        Tokens of one message (converted to a numpy array internally).
    brand_list : iterable of str
        Brands to look for. This argument is now actually used; the previous
        implementation silently read the global `brands_list` instead.

    Returns
    -------
    dict
        ``{brand: tokens}`` ordered by first appearance in the message. The
        first brand receives everything from the start of the message up to
        the next brand's first mention; every later brand receives the tokens
        from its own first mention up to the next brand's first mention (or
        the end of the message). Empty when no brand is present.
    """
    m = np.asarray(m)

    # Index of the first mention of every brand that occurs in the message
    first_mention = {}
    for brand in brand_list:
        hits = np.where(m == brand)[0]
        if len(hits) > 0:
            first_mention[brand] = hits[0]

    # Brands ordered by where they first appear
    ordered = sorted(first_mention.items(), key=lambda item: item[1])

    # Segment boundaries: every first-mention index plus the end of the message
    bounds = [position for _, position in ordered] + [len(m)]

    review = {}
    for i, (brand, _) in enumerate(ordered):
        # The first brand also owns the words that precede it
        start = 0 if i == 0 else bounds[i]
        review[brand] = m[start:bounds[i + 1]]
    return review

get_attr_in_the_right(m, brands_list)

In [None]:
# Do this for all records

# One dictionary of per-brand segments for every message, in row order
brand_reviews_1st_approach = [
    get_attr_in_the_right(message, brands_list)
    for message in df.Message_words_v3
]

# generates a list (1 entry per row) of dictionaries {brand_1: part of text corresponding, 
# brand_2: part of text corresponding,}

In [None]:
# 2nd approach: consider n words to each side when associating with brands

m = df.Message_words_v3[9]

def get_attr_n_per_side(m, n, brand_list):
    """Return, for every brand in a message, a window of words around its first mention.

    Parameters
    ----------
    m : sequence of str
        Tokens of one message (converted to a numpy array internally).
    n : int
        Number of words to keep on *each* side of the brand mention.
    brand_list : iterable of str
        Brands to look for. This argument is now actually used; the previous
        implementation silently read the global `brands_list` instead.

    Returns
    -------
    dict
        ``{brand: tokens}`` ordered by first appearance in the message, where
        tokens are up to n words before the brand, the brand itself, and up
        to n words after it (clipped at the message boundaries).
    """
    m = np.asarray(m)

    # Index of the first mention of every brand that occurs in the message
    first_mention = {}
    for brand in brand_list:
        hits = np.where(m == brand)[0]
        if len(hits) > 0:
            first_mention[brand] = hits[0]

    review = {}
    for brand, position in sorted(first_mention.items(), key=lambda item: item[1]):
        left_loc = max(0, position - n)  # clip at the start of the message
        # +1 because the slice end is exclusive: without it only n-1 words to
        # the right of the brand were kept, while n were kept to the left.
        right_loc = min(len(m), position + n + 1)  # clip at the end of the message
        review[brand] = m[left_loc:right_loc]

    return review

get_attr_n_per_side(m, 5, brands_list)        

In [None]:
# Do this for all records
# One dictionary of per-brand word windows (4 words per side) for every message
brand_reviews_2nd_approach = [
    get_attr_n_per_side(message, 4, brands_list)
    for message in df.Message_words_v3
]

# Generates a list (each row is a post) of dictionaries where {brand_1: part of text corresponding, brand_2: part of text corresponding}

In [None]:
# Concatenate all reviews for each brand into a single key in a dictionary
# Creates dictionary: each brand mentioned is a key, value is all words assigned to that brand 
# According to 1st method get_attr_in_the_right()

review_accum_1st = {brand: [] for brand in brands_list} #deprecated dictionary but still interesting

# Single pass over the split reviews: every segment is appended directly to
# its brand's list. This replaces a scan of all reviews once per brand, and a
# brand listed twice in brands_list no longer has its words added twice.
for review in brand_reviews_1st_approach:
    for brand, segment in review.items():
        if brand in review_accum_1st:
            review_accum_1st[brand].extend(segment)

In [None]:
# Same as above but for 2nd method: get_attr_n_per_side()
review_accum_2nd = {brand: [] for brand in brands_list}

# Single pass over the split reviews: every word window is appended directly
# to its brand's list. This replaces a scan of all reviews once per brand, and
# a brand listed twice in brands_list no longer has its words added twice.
for review in brand_reviews_2nd_approach:
    for brand, segment in review.items():
        if brand in review_accum_2nd:
            review_accum_2nd[brand].extend(segment)

In [None]:
# Create Series with all words 
words = pd.Series([i for review in df.Message_words_v3 for i in review])

In [None]:
# Get value count for words excluding brands
words[~words.isin(brands_list)].value_counts().head(60)


In [None]:
words_count = words.value_counts()
words_nobrands_count = words[~words.isin(brands_list)].value_counts()

### Stemming

In [None]:
# NOTE: Stemming could potentially cause information loss if words are unintentionally reduced in a way that would result in a 
# common stem being used for different words 
''' Stemming
stemmer = PorterStemmer()

def stem_list (text):
    return pd.Series([stemmer.stem(w) for w in text])

stemmed_words = stem_list(words)
len(stemmed_words)

stemmed_words[~stemmed_words.isin(brands_list)].value_counts().head(60)

stemwords_count = stemmed_words.value_counts()
stemwords_nobrands_count = stemmed_words[~stemmed_words.isin(brands_list)].value_counts()
'''


In [None]:
#For words_nobrands_count remove words that appear 2x or less
words_nobrands_count = words_nobrands_count[words_nobrands_count > 2]
words_nobrands_count

# This csv is used to create the attribute map
#words_nobrands_count.to_csv("attribute_count_final.csv")


## Attribute mapping

Attribute mapping was done manually: we browsed the rows of the xlsx file and linked the words we found relevant to a common key (the attribute synonym).

In [None]:
df_map = pd.read_excel("attribute_count_final.xlsx", sheet_name = 0)
df_map = df_map.drop('Unnamed: 4', axis=1)

In [None]:
# Rows without an attribute flag are treated as "not an attribute" (0)
df_map['attribute_flg'] = df_map['attribute_flg'].fillna(0)

# Where no replacement exists, fill it with the original word.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on an attribute-accessed column is a chained in-place operation that is not
# guaranteed to update df_map (it does not under pandas Copy-on-Write).
df_map['attribute_synonym'] = df_map['attribute_synonym'].fillna(df_map['word'])


In [None]:
# Total frequency per attribute group (flagged rows only), most frequent first
df_pivot_map = (
    df_map.loc[df_map['attribute_flg'] == 1.0]
    .groupby('attribute_synonym')['freq']
    .sum()
    .sort_values(ascending=False)
)
df_pivot_map.head(10)

The top 5 attributes are: affordability, sustainability, size, driveability and engine

In [None]:
# Replace words with attributes in df.Message_words_v3
def replace_attributes(m):
    """Replace the words of message `m` with their attribute synonyms.

    Walks through the rows of the global `df_map` in order and, for each row,
    substitutes every token equal to `word` with `attribute_synonym`.
    Returns the resulting numpy array.
    """
    for word, synonym in zip(df_map['word'], df_map['attribute_synonym']):
        m = np.where(m == word, synonym, m)
    return m

In [None]:
# Replaces attributes (Takes ~20-30 mins to run)
df["Message_words_v3"] = df["Message_words_v3"].apply(replace_attributes)

In [None]:
# Drop Message_words_v2 for size
df_saved = df.copy()
df = df.drop("Message_words_v2", axis=1)

In [None]:
# Create checkpoint csv to avoid long function call later
# df.to_pickle("data_checkpoint.pkl")

# Calculating lift scores

In [None]:
# Load the checkpoint (a pickle file, not a csv) and rebind df to it.
# NOTE(review): the df.to_pickle("data_checkpoint.pkl") call in the preceding cell
# is commented out, so this read only works if the file already exists on disk.
# pd.read_pickle can execute arbitrary code on load; only read trusted files.
df = pd.read_pickle("data_checkpoint.pkl")

In [None]:
# Calculates lift between a brand and an attribute
def calculate_lift(brand_name, attribute_name, reviews=None):
    """Compute the lift between a brand and an attribute over whole reviews.

    lift = N * n(brand and attribute) / (n(brand) * n(attribute)), where N is
    the number of reviews and n(.) counts the reviews containing the term(s).

    Parameters
    ----------
    brand_name : str
        Brand token to look for.
    attribute_name : str
        Attribute token to look for.
    reviews : iterable of token sequences, optional
        Reviews to evaluate. Defaults to the global ``df.Message_words_v3``,
        which is what the function always used before this parameter existed.

    Returns
    -------
    tuple
        ``(lift, string_list)`` where ``string_list`` holds the human-readable
        counts behind the score. ``lift`` is NaN when the brand or the
        attribute never occurs, instead of raising ZeroDivisionError.
    """
    if reviews is None:
        reviews = df.Message_words_v3

    review_count = 0
    brand_count = 0
    attribute_count = 0
    attr_brand_count = 0

    # One pass collects all four counts
    for review in reviews:
        review_count += 1
        has_brand = brand_name in review
        has_attribute = attribute_name in review
        if has_brand:
            brand_count += 1
        if has_attribute:
            attribute_count += 1
        if has_brand and has_attribute:
            attr_brand_count += 1

    # Used to get counts in separate table
    string_list = [
        f"There are {review_count} reviews",
        f"{brand_name} appears {brand_count} times",
        f"{attribute_name} appears {attribute_count} times",
        f"{attribute_name} and {brand_name} appears together {attr_brand_count} times",
    ]

    # Lift is undefined when either term never occurs
    denominator = brand_count * attribute_count
    if denominator == 0:
        return float('nan'), string_list

    lift = review_count * (attr_brand_count / denominator)
    return lift, string_list

In [None]:
# Define top brands and attributes as described in parts A and C/D
top_attributes = ['affordability', 'sustainability', 'size', 'driveability', 'engine']
top_brands = ['honda','toyota','nissan','volkswagen','chevrolet']

In [None]:
# Initialize dataframes to store lift and support values
df_lift = pd.DataFrame(index=top_brands, columns = top_attributes)
df_lift_counts = pd.DataFrame(index=top_brands, columns = top_attributes)

In [None]:
# Generate matrix for lift calculation
for brand in top_brands:
    for attribute in top_attributes:
        df_lift.loc[brand, attribute], df_lift_counts.loc[brand, attribute] = calculate_lift(brand, attribute)

In [None]:
df_lift

In [None]:
df_lift_counts

## Calculating lift from brand-attribute assignment lists
brand_reviews_1st_approach and brand_reviews_2nd_approach


In [None]:
# Get split reviews with replaced attributes
# Generates a list (1 entry per row) of dictionaries {brand_1: part of text corresponding, brand_2: part of text corresponding}
# First approach: segments between consecutive brand mentions
brand_reviews_1st_mapped = [
    get_attr_in_the_right(message, brands_list)
    for message in df.Message_words_v3
]

# Second approach: word window (4 per side) around each brand mention
brand_reviews_2nd_mapped = [
    get_attr_n_per_side(message, 4, brands_list)
    for message in df.Message_words_v3
]

In [None]:
# Calculate split from list of dictionaries brand_reviews_1st_mapped and brand_reviews_2nd_mapped

def calculate_lift_split(brand_name, attribute_name, review_list):
    """Compute the lift between a brand and an attribute over split reviews.

    A "review" here is the part of a forum post assigned to one specific
    brand, i.e. one (brand, tokens) entry of the dictionaries produced by
    get_attr_in_the_right or get_attr_n_per_side.

    lift = N * n(brand and attribute) / (n(brand) * n(attribute)), where N is
    the total number of split reviews.

    Parameters
    ----------
    brand_name : str
        Brand key to look for.
    attribute_name : str
        Attribute token to look for inside the tokens of a split review.
    review_list : iterable of dict
        One ``{brand: tokens}`` dictionary per forum post.

    Returns
    -------
    tuple
        ``(lift, string_list)`` where ``string_list`` holds the human-readable
        counts behind the score. ``lift`` is NaN when the brand or the
        attribute never occurs, instead of raising ZeroDivisionError.
    """
    review_count = 0
    brand_count = 0
    attribute_count = 0
    attr_brand_count = 0

    # All counts come from `review_list`. The co-occurrence count used to be
    # taken from the global brand_reviews_1st_mapped regardless of the
    # argument, which made the results for any other list wrong.
    for review in review_list:  # review is a dict {brand: tokens}
        for brand, tokens in review.items():
            review_count += 1
            is_brand = brand == brand_name
            has_attribute = attribute_name in tokens
            if is_brand:
                brand_count += 1
            if has_attribute:
                attribute_count += 1
            if is_brand and has_attribute:
                attr_brand_count += 1

    # Used to get counts in separate table
    string_list = [
        f"There are {review_count} reviews about specific brands",
        f"{brand_name} appears {brand_count} times",
        f"{attribute_name} appears {attribute_count} times",
        f"{attribute_name} and {brand_name} appears together {attr_brand_count} times",
    ]

    # Lift is undefined when either term never occurs
    denominator = brand_count * attribute_count
    if denominator == 0:
        return float('nan'), string_list

    lift = review_count * (attr_brand_count / denominator)
    return lift, string_list

In [None]:
# Initialize dataframes
df_lift_1st_mapped = pd.DataFrame(index=top_brands, columns = top_attributes)
df_lift_1st_counts = pd.DataFrame(index=top_brands, columns = top_attributes)
df_lift_2nd_mapped = pd.DataFrame(index=top_brands, columns = top_attributes)
df_lift_2nd_counts = pd.DataFrame(index=top_brands, columns = top_attributes)

In [None]:
# Getting lift for brand_reviews_1st_mapped
for brand in top_brands:
    for attribute in top_attributes:
        df_lift_1st_mapped.loc[brand, attribute], df_lift_1st_counts.loc[brand, attribute] = calculate_lift_split(brand, attribute, brand_reviews_1st_mapped)

In [None]:
df_lift_1st_mapped

In [None]:
df_lift_1st_counts

In [None]:
# Getting lift for brand_reviews_2nd_mapped
for brand in top_brands:
    for attribute in top_attributes:
        df_lift_2nd_mapped.loc[brand, attribute], df_lift_2nd_counts.loc[brand, attribute] = calculate_lift_split(brand, attribute, brand_reviews_2nd_mapped)

In [None]:
df_lift_2nd_mapped

In [None]:
df_lift_2nd_counts