### Loading processed discussion board data

In [100]:
import pandas as pd
import numpy as np
df = pd.read_pickle('data_clean.pkl')
df.Message_words = df.Message_words.apply(lambda x: [i.lower() for i in x])
df

Unnamed: 0,Date,User_Id,Message,NumberOfPastPosts,Role,Message_words
0,2001-09-01,pat,Need help choosing your next vehicle? Tell us ...,10421,Member,"[need, help, choosing, next, vehicle, tell, us..."
1,2001-09-01,willow25,"I'm buying a ""new"" car by the end of the year ...",2,Member,"[im, buying, new, car, end, year, first, choic..."
2,2001-10-01,dindak,Buy a 2001/ 2002 Oldsmobile Intrigue. It's one...,6632,Member,"[buy, 2001, 2002, oldsmobile, intrigue, one, b..."
3,2001-10-01,peteri1,Am looking at 1 of 4 choices to replace my 99G...,21,Member,"[looking, 1, 4, choices, replace, 99gs400, ive..."
4,2001-10-01,mrdetailer,Generally a 3-4 year old car range is a good o...,1118,Member,"[generally, 34, year, old, car, range, good, o..."
...,...,...,...,...,...,...
4146,2021-02-01,kyfdx,qbrozen said: show previous quoteskyfdx said:I...,203030,Moderator,"[qbrozen, said, show, previous, quoteskyfdx, s..."
4147,2021-02-01,backy,KamCottage said:Thank you backy. Since I first...,18946,Member,"[kamcottage, saidthank, backy, since, first, p..."
4148,2021-02-01,RayeEliza,I am doing major research in trying to find my...,6,Member,"[major, research, trying, find, new, car, lean..."
4149,2021-02-01,mlevine,I would lean toward subaru. Need to see which ...,512,Member,"[would, lean, toward, subaru, need, see, model..."


### Loading and cleaning models.csv

In [101]:
models = pd.read_csv('models.csv', header=None, names = ['brand', 'model'])
models['brand'] = models['brand'].str.replace(r'[^\w\s]+', '', regex=True)
models['model'] = models['model'].str.replace(r'[^\w\s]+', '', regex=True)

#drop row with: car, problem, seat 
searchfor = ["car", "problem", "seat", "sedan"]
models = models[~models.brand.str.contains('|'.join(searchfor))]

models

Unnamed: 0,brand,model
0,acura,integra
1,acura,Legend
2,acura,vigor
3,acura,rlx
4,acura,ILX
...,...,...
523,volvo,xc90
524,volvo,s60
525,volvo,s80
526,volvo,v60


### Replacing models with brands

In [102]:
messages = df.Message_words.to_list()

In [103]:
# numpy.where to do the replacement
messages2 = []
t = 0
for m in messages:
    for i,row in models.iterrows():
        m = np.where(m == row['model'], row['brand'], m)
        
    t+=1
    #print(t)
        
    messages2.append(m)
        
df['Message_words_v2'] = messages2

In [104]:
import joblib 
df.to_pickle('df_brands.pickle')

### Brand Frequency Count 

In [105]:
df = pd.read_pickle("df_brands.pickle")
df

Unnamed: 0,Date,User_Id,Message,NumberOfPastPosts,Role,Message_words,Message_words_v2
0,2001-09-01,pat,Need help choosing your next vehicle? Tell us ...,10421,Member,"[need, help, choosing, next, vehicle, tell, us...","[need, help, choosing, next, vehicle, tell, us..."
1,2001-09-01,willow25,"I'm buying a ""new"" car by the end of the year ...",2,Member,"[im, buying, new, car, end, year, first, choic...","[im, buying, new, car, end, year, first, choic..."
2,2001-10-01,dindak,Buy a 2001/ 2002 Oldsmobile Intrigue. It's one...,6632,Member,"[buy, 2001, 2002, oldsmobile, intrigue, one, b...","[buy, 2001, 2002, oldsmobile, intrigue, one, b..."
3,2001-10-01,peteri1,Am looking at 1 of 4 choices to replace my 99G...,21,Member,"[looking, 1, 4, choices, replace, 99gs400, ive...","[looking, 1, 4, choices, replace, 99gs400, ive..."
4,2001-10-01,mrdetailer,Generally a 3-4 year old car range is a good o...,1118,Member,"[generally, 34, year, old, car, range, good, o...","[generally, 34, year, old, car, range, good, o..."
...,...,...,...,...,...,...,...
4146,2021-02-01,kyfdx,qbrozen said: show previous quoteskyfdx said:I...,203030,Moderator,"[qbrozen, said, show, previous, quoteskyfdx, s...","[qbrozen, said, show, previous, quoteskyfdx, s..."
4147,2021-02-01,backy,KamCottage said:Thank you backy. Since I first...,18946,Member,"[kamcottage, saidthank, backy, since, first, p...","[kamcottage, saidthank, backy, since, first, p..."
4148,2021-02-01,RayeEliza,I am doing major research in trying to find my...,6,Member,"[major, research, trying, find, new, car, lean...","[major, research, trying, find, new, car, lean..."
4149,2021-02-01,mlevine,I would lean toward subaru. Need to see which ...,512,Member,"[would, lean, toward, subaru, need, see, model...","[would, lean, toward, subaru, need, see, model..."


In [106]:
mentions = []
brand_names = models.brand.unique()

for i in brand_names: 
    count = 0
    for j in df.Message_words_v2: 
        if(i in j):
            count +=1 
    
    mentions.append(count)

### Identifying top 10 brands by frequency:

In [107]:
brand_freq = pd.DataFrame({'brand': brand_names, 'mentions': mentions})
top10 = brand_freq.sort_values('mentions', ascending = False).head(10)
top10

Unnamed: 0,brand,mentions
9,honda,924
25,toyota,769
19,nissan,525
26,volkswagen,370
5,chevrolet,337
15,mazda,312
8,ford,284
2,bmw,275
11,hyundai,237
1,audi,224


# Top 10 Brands

In [108]:
top10v = top10.copy()

#percentage of total mentions
pct_list = []
for i in top10v.mentions: 
    pct = i*100/top10v.mentions.sum()
    pct = "{:.2f}".format(pct)
    pct_list.append(pct)

top10v["Percentage_Mentions"] = pct_list
top10v.columns = ['Brand', 'Total Mentions','% of Mentions']


#index starting from 1 
top10v = top10v.reset_index(drop=True)
top10v.index += 1 

#capitalizing brand names 
for i in range(len(top10v.Brand)): 
    top10v.Brand[i+1] = top10v.Brand[i+1].capitalize()


top10v



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Brand,Total Mentions,% of Mentions
1,Honda,924,21.71
2,Toyota,769,18.06
3,Nissan,525,12.33
4,Volkswagen,370,8.69
5,Chevrolet,337,7.92
6,Mazda,312,7.33
7,Ford,284,6.67
8,Bmw,275,6.46
9,Hyundai,237,5.57
10,Audi,224,5.26


In [109]:
brand_names = top10.brand

for i in brand_names: 
    y = []
    for j in df.Message_words_v2: 
        if(i in j):
            var = 1
            y.append(var)
        else: 
            var = 0
            y.append(var)  
    
    df[i] = y

    
df

Unnamed: 0,Date,User_Id,Message,NumberOfPastPosts,Role,Message_words,Message_words_v2,honda,toyota,nissan,volkswagen,chevrolet,mazda,ford,bmw,hyundai,audi
0,2001-09-01,pat,Need help choosing your next vehicle? Tell us ...,10421,Member,"[need, help, choosing, next, vehicle, tell, us...","[need, help, choosing, next, vehicle, tell, us...",0,0,0,0,0,0,0,0,0,0
1,2001-09-01,willow25,"I'm buying a ""new"" car by the end of the year ...",2,Member,"[im, buying, new, car, end, year, first, choic...","[im, buying, new, car, end, year, first, choic...",0,0,1,0,0,0,0,0,0,0
2,2001-10-01,dindak,Buy a 2001/ 2002 Oldsmobile Intrigue. It's one...,6632,Member,"[buy, 2001, 2002, oldsmobile, intrigue, one, b...","[buy, 2001, 2002, oldsmobile, intrigue, one, b...",0,0,0,0,0,0,0,0,0,0
3,2001-10-01,peteri1,Am looking at 1 of 4 choices to replace my 99G...,21,Member,"[looking, 1, 4, choices, replace, 99gs400, ive...","[looking, 1, 4, choices, replace, 99gs400, ive...",0,0,0,0,0,0,0,0,0,1
4,2001-10-01,mrdetailer,Generally a 3-4 year old car range is a good o...,1118,Member,"[generally, 34, year, old, car, range, good, o...","[generally, 34, year, old, car, range, good, o...",0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4146,2021-02-01,kyfdx,qbrozen said: show previous quoteskyfdx said:I...,203030,Moderator,"[qbrozen, said, show, previous, quoteskyfdx, s...","[qbrozen, said, show, previous, quoteskyfdx, s...",1,1,0,0,0,0,0,0,0,0
4147,2021-02-01,backy,KamCottage said:Thank you backy. Since I first...,18946,Member,"[kamcottage, saidthank, backy, since, first, p...","[kamcottage, saidthank, backy, since, first, p...",1,1,0,0,0,0,0,0,0,0
4148,2021-02-01,RayeEliza,I am doing major research in trying to find my...,6,Member,"[major, research, trying, find, new, car, lean...","[major, research, trying, find, new, car, lean...",0,1,0,0,0,0,0,0,0,0
4149,2021-02-01,mlevine,I would lean toward subaru. Need to see which ...,512,Member,"[would, lean, toward, subaru, need, see, model...","[would, lean, toward, subaru, need, see, model...",1,0,0,0,0,1,0,0,0,0


In [110]:
#calculating probability of brand occurences 

tot_messages = df.shape[0]
brand_names = top10.brand.to_list()

prob_df = pd.DataFrame(columns = brand_names)

for i in brand_names: 
    prob =[]
    for j in brand_names: 
        if i == j: 
            #calculating P(I)
            prob_val = df[i].sum()/tot_messages
            prob.append(prob_val)
        else:
            #calculating P(I&J)
            both = 0
            for c in range(tot_messages):
                if df[i][c] == 1 & df[j][c] == 1:
                    both += 1
            prob_val = both/tot_messages 
            prob.append(prob_val)
                             
    prob_df[i] = prob

#renaming the index 
prob_df.index = brand_names
prob_df      


Unnamed: 0,honda,toyota,nissan,volkswagen,chevrolet,mazda,ford,bmw,hyundai,audi
honda,0.222597,0.081908,0.050831,0.026259,0.024091,0.024813,0.025536,0.013009,0.024331,0.0106
toyota,0.081908,0.185257,0.037822,0.021682,0.026741,0.024091,0.022886,0.011323,0.020236,0.007468
nissan,0.050831,0.037822,0.126476,0.016141,0.013732,0.012286,0.008914,0.006745,0.010359,0.006986
volkswagen,0.026259,0.021682,0.016141,0.089135,0.0106,0.011323,0.008191,0.008191,0.006745,0.011082
chevrolet,0.024091,0.026741,0.013732,0.0106,0.081185,0.008673,0.017345,0.004095,0.007227,0.002891
mazda,0.024813,0.024091,0.012286,0.011323,0.008673,0.075163,0.011563,0.003614,0.008191,0.003854
ford,0.025536,0.022886,0.008914,0.008191,0.017345,0.011563,0.068417,0.004577,0.008914,0.003132
bmw,0.013009,0.011323,0.006745,0.008191,0.004095,0.003614,0.004577,0.066249,0.003132,0.016623
hyundai,0.024331,0.020236,0.010359,0.006745,0.007227,0.008191,0.008914,0.003132,0.057095,0.002168
audi,0.0106,0.007468,0.006986,0.011082,0.002891,0.003854,0.003132,0.016623,0.002168,0.053963


In [111]:
def lift_calculator(a,b, prob_df):
    prob_a = prob_df.loc[a,a]
    prob_b = prob_df.loc[b,b]
    porb_a_b = prob_df.loc[a,b]
    lift = porb_a_b/ (prob_a* prob_b)
    return lift
    

In [112]:
#calculating lift between brands 

lift_df = pd.DataFrame(columns = brand_names)

for i in brand_names:
    lift = []
    for j in brand_names:
        if i == j: 
            lift_val = None
            lift.append(lift_val)
        else: 
            lift_val = lift_calculator(i,j,prob_df)
            lift.append(lift_val)
    lift_df[i] = lift

#renaming the index 
lift_df.index = brand_names
lift_df  
            

Unnamed: 0,honda,toyota,nissan,volkswagen,chevrolet,mazda,ford,bmw,hyundai,audi
honda,,1.986247,1.805527,1.323444,1.333064,1.483076,1.67675,0.882149,1.914493,0.88244
toyota,1.986247,,1.614235,1.313007,1.77795,1.730102,1.805642,0.922553,1.913187,0.747033
nissan,1.805527,1.614235,,1.431748,1.337329,1.292436,1.030094,0.805042,1.434543,1.023631
volkswagen,1.323444,1.313007,1.431748,,1.464785,1.690029,1.34311,1.387066,1.325442,2.303885
chevrolet,1.333064,1.77795,1.337329,1.464785,,1.421251,3.122748,0.761446,1.559178,0.659866
mazda,1.483076,1.730102,1.292436,1.690029,1.421251,,2.248646,0.725699,1.908661,0.950321
ford,1.67675,1.805642,1.030094,1.34311,3.122748,2.248646,,1.009846,2.281854,0.848261
bmw,0.882149,0.922553,0.805042,1.387066,0.761446,0.725699,1.009846,,0.827971,4.649659
hyundai,1.914493,1.913187,1.434543,1.325442,1.559178,1.908661,2.281854,0.827971,,0.703718
audi,0.88244,0.747033,1.023631,2.303885,0.659866,0.950321,0.848261,4.649659,0.703718,


In [113]:
def make_pretty(styler):
    styler.set_caption("Weather Conditions")
    styler.background_gradient(axis=None, vmin=1, vmax=5, cmap="YlGnBu")
    return styler

make_pretty(lift_df.style)

Unnamed: 0,honda,toyota,nissan,volkswagen,chevrolet,mazda,ford,bmw,hyundai,audi
honda,,1.986247,1.805527,1.323444,1.333064,1.483076,1.67675,0.882149,1.914493,0.88244
toyota,1.986247,,1.614235,1.313007,1.77795,1.730102,1.805642,0.922553,1.913187,0.747033
nissan,1.805527,1.614235,,1.431748,1.337329,1.292436,1.030094,0.805042,1.434543,1.023631
volkswagen,1.323444,1.313007,1.431748,,1.464785,1.690029,1.34311,1.387066,1.325442,2.303885
chevrolet,1.333064,1.77795,1.337329,1.464785,,1.421251,3.122748,0.761446,1.559178,0.659866
mazda,1.483076,1.730102,1.292436,1.690029,1.421251,,2.248646,0.725699,1.908661,0.950321
ford,1.67675,1.805642,1.030094,1.34311,3.122748,2.248646,,1.009846,2.281854,0.848261
bmw,0.882149,0.922553,0.805042,1.387066,0.761446,0.725699,1.009846,,0.827971,4.649659
hyundai,1.914493,1.913187,1.434543,1.325442,1.559178,1.908661,2.281854,0.827971,,0.703718
audi,0.88244,0.747033,1.023631,2.303885,0.659866,0.950321,0.848261,4.649659,0.703718,


In [114]:
import numpy as np
np.tril(np.ones(lift_df.shape)).astype(np.bool)[0:10,0:10]
df_lt = lift_df.where(np.tril(np.ones(lift_df.shape)).astype(np.bool))
df_lt


#dealing with null values 
df_lt.fillna(df_lt.max().max()+1, inplace=True)

def color_max_white(val, max_val):
    color = 'white' if val == max_val else 'black'
    return 'color: %s' % color

def highlight_max(data, color='white'):
    attr = 'background-color: {}'.format(color)
    if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
        is_max = data == data.max()
        return [attr if v else '' for v in is_max]
    else:  # from .apply(axis=None)
        is_max = data == data.max().max()
        return pd.DataFrame(np.where(is_max, attr, ''),
                            index=data.index, columns=data.columns)

max_val = df_lt.max().max()
#df_lt.style.format("{:.2f}").background_gradient(cmap='YlGnBu', axis=None).applymap(lambda x: color_max_white(x, max_val)).apply(highlight_max, axis=None)

def make_pretty(styler):
    styler.set_caption("Lift Ratios")
    styler.background_gradient(cmap='YlGnBu', axis=None).applymap(lambda x: color_max_white(x, max_val)).apply(highlight_max, axis=None)
    return styler

make_pretty(df_lt.style)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations



Unnamed: 0,honda,toyota,nissan,volkswagen,chevrolet,mazda,ford,bmw,hyundai,audi
honda,5.649659,5.649659,5.649659,5.649659,5.649659,5.649659,5.649659,5.649659,5.649659,5.649659
toyota,1.986247,5.649659,5.649659,5.649659,5.649659,5.649659,5.649659,5.649659,5.649659,5.649659
nissan,1.805527,1.614235,5.649659,5.649659,5.649659,5.649659,5.649659,5.649659,5.649659,5.649659
volkswagen,1.323444,1.313007,1.431748,5.649659,5.649659,5.649659,5.649659,5.649659,5.649659,5.649659
chevrolet,1.333064,1.77795,1.337329,1.464785,5.649659,5.649659,5.649659,5.649659,5.649659,5.649659
mazda,1.483076,1.730102,1.292436,1.690029,1.421251,5.649659,5.649659,5.649659,5.649659,5.649659
ford,1.67675,1.805642,1.030094,1.34311,3.122748,2.248646,5.649659,5.649659,5.649659,5.649659
bmw,0.882149,0.922553,0.805042,1.387066,0.761446,0.725699,1.009846,5.649659,5.649659,5.649659
hyundai,1.914493,1.913187,1.434543,1.325442,1.559178,1.908661,2.281854,0.827971,5.649659,5.649659
audi,0.88244,0.747033,1.023631,2.303885,0.659866,0.950321,0.848261,4.649659,0.703718,5.649659


### MDS Plot

In [134]:
#lift df without null values 
lift_df = pd.DataFrame(columns = brand_names)

for i in brand_names:
    lift = []
    for j in brand_names:
        if i == j: 
            lift_val = 0
            lift.append(lift_val)
        else: 
            lift_val = lift_calculator(i,j,prob_df)
            lift.append(lift_val)
    lift_df[i] = lift

#renaming the index 
lift_df.index = brand_names
lift_df  

Unnamed: 0,honda,toyota,nissan,volkswagen,chevrolet,mazda,ford,bmw,hyundai,audi
honda,0.0,1.986247,1.805527,1.323444,1.333064,1.483076,1.67675,0.882149,1.914493,0.88244
toyota,1.986247,0.0,1.614235,1.313007,1.77795,1.730102,1.805642,0.922553,1.913187,0.747033
nissan,1.805527,1.614235,0.0,1.431748,1.337329,1.292436,1.030094,0.805042,1.434543,1.023631
volkswagen,1.323444,1.313007,1.431748,0.0,1.464785,1.690029,1.34311,1.387066,1.325442,2.303885
chevrolet,1.333064,1.77795,1.337329,1.464785,0.0,1.421251,3.122748,0.761446,1.559178,0.659866
mazda,1.483076,1.730102,1.292436,1.690029,1.421251,0.0,2.248646,0.725699,1.908661,0.950321
ford,1.67675,1.805642,1.030094,1.34311,3.122748,2.248646,0.0,1.009846,2.281854,0.848261
bmw,0.882149,0.922553,0.805042,1.387066,0.761446,0.725699,1.009846,0.0,0.827971,4.649659
hyundai,1.914493,1.913187,1.434543,1.325442,1.559178,1.908661,2.281854,0.827971,0.0,0.703718
audi,0.88244,0.747033,1.023631,2.303885,0.659866,0.950321,0.848261,4.649659,0.703718,0.0


In [135]:
from sklearn.manifold import MDS

embedding = MDS(n_components=2, random_state = 42)
lift_df_transformed = embedding.fit_transform(lift_df)
lift_df_transformed.shape

mds_df = pd.DataFrame(lift_df_transformed)
mds_df['names'] = brand_names
mds_df.columns = ['component0', 'component1', 'brand']

#graph 
import plotly.express as px
import plotly.io as pio
pio.renderers
pio.renderers.default = "notebook_connected"


fig = px.scatter(mds_df , x="component0", y= "component1", text = "brand")
fig.update_traces(textposition='top center')

fig.show()



The MDS API has changed. ``fit`` now constructs an dissimilarity matrix from data. To use a custom dissimilarity matrix, set ``dissimilarity='precomputed'``.



In [131]:
from sklearn.manifold import MDS

embedding = MDS(n_components=2, random_state =8)
lift_df_transformed = embedding.fit_transform(lift_df[:10])
lift_df_transformed.shape

mds_df = pd.DataFrame(lift_df_transformed)
mds_df['names'] = brand_names
mds_df.columns = ['component0', 'component1', 'brand']


#graph 
import plotly.express as px
import plotly.io as pio
pio.renderers
pio.renderers.default = "notebook_connected"


fig = px.scatter(mds_df , x="component0", y= "component1", text = "brand")
fig.update_traces(textposition='top center')

fig.show()


The MDS API has changed. ``fit`` now constructs an dissimilarity matrix from data. To use a custom dissimilarity matrix, set ``dissimilarity='precomputed'``.

