### Loading processed discussion board data

In [None]:
# Load the cleaned discussion-board frame and lower-case every token of each message
import pandas as pd
import numpy as np

df = pd.read_pickle('data_clean.pkl')
df['Message_words'] = df['Message_words'].apply(
    lambda words: [word.lower() for word in words]
)
df

### Loading and cleaning models.csv

In [None]:
# Read the brand/model lookup table; the file has no header row, so name the columns here
models = pd.read_csv('models.csv', header=None, names=['brand', 'model'])

# Strip every run of characters that is neither a word character nor whitespace
for column in ('brand', 'model'):
    models[column] = models[column].str.replace(r'[^\w\s]+', '', regex=True)

# Drop rows whose brand contains any of these words (regex substring match, not equality)
searchfor = ["car", "problem", "seat", "sedan"]
unwanted_pattern = '|'.join(searchfor)
is_unwanted = models['brand'].str.contains(unwanted_pattern)
models = models[~is_unwanted]

models

### Replacing models with brands

In [None]:
# Replace every model name in each message with the brand that makes it.
#
# The previous implementation looped over `models` with iterrows() and called
# np.where on each message. On the first row the message was still a Python
# list, so `m == row['model']` evaluated to a single False and that row's
# model was never replaced. It also did messages x models comparisons.
# A dictionary lookup fixes both.

# model -> brand lookup; setdefault keeps the FIRST brand listed for a model,
# which is the row that would have matched first in row order.
model_to_brand = {}
for brand, model in zip(models['brand'], models['model']):
    model_to_brand.setdefault(model, brand)

# creating a list from the messages column
messages = df.Message_words.to_list()

# Words that are not a known model are kept unchanged. Each word is looked up
# once, so a brand that is also listed as another brand's model is not
# rewritten a second time.
messages2 = [
    [model_to_brand.get(word, word) for word in words]
    for words in messages
]

df['Message_words_v2'] = messages2

In [None]:
#saving the newly obtained dataframe as a pickle
# (written so the brand-frequency cell can reload it without redoing the replacement)
import joblib  # NOTE(review): joblib is not used in this cell; pandas does the pickling
df.to_pickle('df_brands.pickle')


### Brand Frequency Count 

In [None]:
# Reload the frame whose messages have had model names replaced by brand names
df = pd.read_pickle("df_brands.pickle")

brand_names = models.brand.unique()

# For each brand, count the messages that mention it at least once
# (a message counts once no matter how many times the brand appears in it)
mentions = [
    sum(1 for words in df.Message_words_v2 if brand in words)
    for brand in brand_names
]

### Task A - Identifying Top 10 Brands 

In [None]:
# One row per brand with the number of messages that mention it
brand_freq = pd.DataFrame({'brand': brand_names, 'mentions': mentions})
# The ten most-mentioned brands (original index and lower-case names kept for later cells)
top10 = brand_freq.sort_values('mentions', ascending = False).head(10)

# Presentation copy of the top 10, ranked from 1
top10v = top10.reset_index(drop=True)
top10v.index += 1

# Share of ALL brand mentions (not just the top 10), kept as a 2-decimal string
total_mentions = brand_freq.mentions.sum()
top10v["Percentage_Mentions"] = [
    "{:.2f}".format((count * 100) / total_mentions) for count in top10v.mentions
]

# renaming columns
top10v.columns = ['Brand', 'Total_Mentions', '% of Mentions']

# Capitalize brand names with a whole-column assignment. The previous
# element-wise `top10v.Brand[i+1] = ...` was chained assignment, which raises
# SettingWithCopyWarning and does not modify the frame under Copy-on-Write.
top10v['Brand'] = top10v['Brand'].str.capitalize()

top10v


In [None]:
#first 20 rows of brand_freq (the frame holding every brand and its mention count)
brand_freq.head(20)

In [None]:
import matplotlib.pyplot as plt

# Pie chart of mention counts, with wedges pulled apart so that brands sharing
# a k-means cluster label sit together.
all_df = top10.reset_index(drop=True)
all_df.index += 1  # index starting from 1

# Capitalize brand names with a whole-column assignment; the previous
# `all_df.brand[i+1] = ...` was chained assignment and is a no-op under
# pandas Copy-on-Write.
all_df['brand'] = all_df['brand'].str.capitalize()

# Every brand outside the top 10 is collapsed into a single "Other" slice
others_val = brand_freq.mentions.sum() - top10.mentions.sum()
all_df.loc[len(all_df) + 1] = ['Other', others_val]

# NOTE(review): cluster ids are hard-coded, one per top-10 brand in rank order
# plus a final id for "Other". Presumably copied from a run of the k-means
# cell further down; confirm they still match that cell's output.
k_labels = [0,0,0,2,2,2,2,1,0,1,3]
all_df['labels'] = k_labels
# Stable sort so brands keep their rank order inside each cluster
all_df = all_df.sort_values('labels', kind='mergesort')

# plot parameters
fig, ax = plt.subplots()
ax.set_aspect('equal')

data = all_df.mentions
wedges, texts, percs = ax.pie(data, labels= all_df.brand,
                              autopct="%1.1f%%")

# Wedge positions grouped by cluster: consecutive rows with the same label
# form one group. Derived from the labels so it cannot drift out of sync
# with k_labels (for the labels above this gives
# [[0, 1, 2, 3], [4, 5], [6, 7, 8, 9], [10]]).
sorted_labels = all_df['labels'].to_list()
groups = []
for position, label in enumerate(sorted_labels):
    if position > 0 and label == sorted_labels[position - 1]:
        groups[-1].append(position)
    else:
        groups.append([position])

# Fraction of the wedge radius by which each group is pushed outwards
radfraction = 0.05

# Shift every wedge of a group, and its two text labels, along the bisector
# of the angle spanned by the whole group
for group in groups:
    ang = np.deg2rad((wedges[group[-1]].theta2 + wedges[group[0]].theta1) / 2)
    for j in group:
        center = radfraction * wedges[j].r * np.array([np.cos(ang), np.sin(ang)])
        wedges[j].set_center(center)
        texts[j].set_position(np.array(texts[j].get_position()) + center)
        percs[j].set_position(np.array(percs[j].get_position()) + center)

ax.autoscale(True)
plt.show()



In [None]:
# Bar chart of total mentions for each top-10 brand, value printed above each bar
import plotly.express as px

fig = (
    px.bar(top10v, x='Brand', y='Total_Mentions', text='Total_Mentions')
    .update_traces(textposition='outside', texttemplate='%{text:.2s}')
    .update_layout(uniformtext_mode='hide', uniformtext_minsize=8)
)
fig.show()

In [None]:
# Add one 0/1 indicator column per top-10 brand: 1 when the brand appears in the message
brand_names = top10.brand

for brand in brand_names:
    df[brand] = [1 if brand in words else 0 for words in df.Message_words_v2]

df

In [None]:
# Probability of brand occurrences.
#
# prob_df is a (brand x brand) table:
#   diagonal      P(I)     = share of messages that mention brand I
#   off-diagonal  P(I & J) = share of messages that mention both I and J
#
# The previous version walked every message in Python for every brand pair
# and tested `df[i][c] == 1 & df[j][c] == 1`. `&` binds tighter than `==`,
# so that parsed as `df[i][c] == (1 & df[j][c]) == 1`, which was correct only
# because the indicators are 0/1. `df[i][c]` also required a 0..n-1 index.
# The matrix product below computes the same counts without either problem.

tot_messages = df.shape[0]
brand_names = top10.brand.to_list()

# messages x brands matrix of 0/1 flags (1 where the indicator column equals 1)
mentioned = (df[brand_names] == 1).to_numpy().astype(int)

# brands x brands: entry (i, j) counts messages where both flags are 1;
# the diagonal is therefore each brand's own message count
co_mention_counts = mentioned.T @ mentioned

prob_df = pd.DataFrame(co_mention_counts / tot_messages,
                       index=brand_names, columns=brand_names)
prob_df


In [None]:
#defining the lift calculation function
def lift_calculator(a,b, prob_df):
    """Return the lift of ``a`` and ``b``: P(a & b) / (P(a) * P(b)).

    Parameters
    ----------
    a, b : labels present in both the index and the columns of ``prob_df``.
    prob_df : table whose diagonal entry ``[x, x]`` is read as P(x) and whose
        entry ``[a, b]`` is read as P(a & b).

    Returns
    -------
    The value at ``[a, b]`` divided by the product of the two diagonal values.
    """
    joint = prob_df.loc[a, b]
    expected_if_independent = prob_df.loc[a, a] * prob_df.loc[b, b]
    return joint / expected_if_independent
    

In [None]:
# Lift for every ordered pair of brands; a brand paired with itself is left empty

lift_df = pd.DataFrame(columns = brand_names)

for col_brand in brand_names:
    lift_df[col_brand] = [
        None if col_brand == row_brand
        else lift_calculator(col_brand, row_brand, prob_df)
        for row_brand in brand_names
    ]

# label the rows with the brand names as well
lift_df.index = brand_names
lift_df
            

In [None]:
# Styled lower-triangle view of the lift matrix
import numpy as np

# Keep the lower triangle (np.tril includes the diagonal); everything above it
# becomes NaN. `np.bool` was removed in NumPy 1.24, so the builtin `bool` is used.
lower_mask = np.tril(np.ones(lift_df.shape)).astype(bool)
df_lt = lift_df.where(lower_mask)

# Fill every empty cell (upper triangle, plus the diagonal, which lift_df
# already leaves empty) with a sentinel one above the largest real lift, so
# the styling below can single those cells out as "the maximum".
df_lt = df_lt.fillna(df_lt.max().max() + 1)


def color_max_white(val, max_val):
    """Return a CSS text colour: white when ``val`` equals ``max_val``, else black."""
    color = 'white' if val == max_val else 'black'
    return 'color: %s' % color


def highlight_max(data, color='white'):
    """Return CSS background styles that mark the maximum of ``data``.

    For a Series (``Styler.apply`` with axis=0 or axis=1) a list of styles is
    returned; for a DataFrame (axis=None) a same-shaped DataFrame of styles.
    Cells that are not the maximum get an empty style string.
    """
    attr = 'background-color: {}'.format(color)
    if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
        is_max = data == data.max()
        return [attr if v else '' for v in is_max]
    else:  # from .apply(axis=None)
        is_max = data == data.max().max()
        return pd.DataFrame(np.where(is_max, attr, ''),
                            index=data.index, columns=data.columns)


# After the fill above this is the sentinel value, i.e. the filled-in cells
max_val = df_lt.max().max()


def make_pretty(styler):
    """Apply caption, colour gradient and white-out of sentinel cells; return ``styler``."""
    styler.set_caption("Lift Ratios")
    styler.background_gradient(cmap='YlGnBu', axis=None)
    # Styler.applymap was renamed to Styler.map in pandas 2.1; use whichever exists
    elementwise = styler.map if hasattr(styler, 'map') else styler.applymap
    # White text on a white background hides the sentinel cells
    elementwise(lambda x: color_max_white(x, max_val))
    styler.apply(highlight_max, axis=None)
    return styler


make_pretty(df_lt.style)

### MDS Plot

In [None]:
# Dissimilarity between brands = 1 / lift; a brand is at distance 0 from itself
# NOTE(review): a pair with lift 0 would produce a division by zero here;
# confirm every top-10 pair is co-mentioned at least once.
diss_df = pd.DataFrame(columns = brand_names)

for col_brand in brand_names:
    diss_df[col_brand] = [
        0 if col_brand == row_brand
        else 1/lift_calculator(col_brand, row_brand, prob_df)
        for row_brand in brand_names
    ]

# label the rows with the brand names as well
diss_df.index = brand_names
diss_df

In [None]:
from sklearn.manifold import MDS

# Reduce to two components for x/y plotting; random_state is fixed so every
# run gives the same layout.
# NOTE(review): diss_df already holds pairwise dissimilarities, but MDS is not
# told the input is precomputed, so it treats each row as a feature vector.
# Confirm whether dissimilarity='precomputed' was intended. Left unchanged
# here because switching would alter every downstream plot and cluster.
embedding = MDS(n_components=2, random_state = 42)
diss_df_transformed = embedding.fit_transform(diss_df)

# One row per brand, in brand_names order
mds_df = pd.DataFrame(diss_df_transformed, columns=['component0', 'component1'])

# Capitalized brand names, assigned as a whole column. The previous
# `mds_df.brand[i] = ...` was chained assignment, which is a no-op under
# pandas Copy-on-Write.
mds_df['brand'] = [name.capitalize() for name in brand_names]


In [None]:
import plotly.io as pio
pio.renderers.default = "notebook_connected"

# Scatter of the two MDS components, each point labelled with its brand name
fig = px.scatter(mds_df, x="component0", y="component1", text="brand")
fig.update_traces(textposition='top center', textfont_size=14)
fig.update_layout(
    title_text='Brand Co-Mentions in the Forum for Finding the Ideal Car',
    xaxis_title=None,
    yaxis_title=None,
)

fig.show()


In [None]:
#Setting up the data for kmeans clustering on the MDS plot 
# Moving 'brand' into the index leaves only the two numeric component columns.
# NOTE: not idempotent - running this cell a second time raises a KeyError
# because 'brand' is no longer a column.
mds_df = mds_df.set_index('brand')
mds_df

In [None]:
#mds plot: brand comentions 
import matplotlib.pyplot as plt
from scipy.spatial import ConvexHull
import numpy as np
with plt.style.context('fivethirtyeight'):
    fig, ax = plt.subplots(1, figsize=(8,8))

    # plot data
    x = mds_df.component0
    y = mds_df.component1
    ax.scatter(x, y, s=100)

    # Label each point with its brand. Values are iterated directly rather than
    # read as x[i]: the index holds brand names, so an integer lookup relies on
    # positional fallback, which pandas has deprecated.
    for txt, x_pos, y_pos in zip(mds_df.index, x, y):
        ax.annotate(txt, (x_pos, y_pos))

In [None]:
#plotting the elbow method to determine ideal number of clusters
from sklearn.cluster import KMeans

# Cluster on the two MDS coordinates only, so the curve is unaffected by
# columns that later cells add to mds_df.
coords = mds_df[['component0', 'component1']]

distortions = []
K = range(1,5)

for k in K:
    # random_state makes the curve reproducible; n_init=10 pins the number of
    # restarts explicitly (the library default has changed between releases).
    model = KMeans(n_clusters=k, n_init=10, random_state=42)
    model.fit(coords)
    distortions.append(model.inertia_)


plt.figure(figsize=(16,8))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

#elbow occurs at 3 --> select k=3

In [None]:
#kmeans clustering model 
# Seeded so cluster ids are stable between runs; n_init=10 pins the number of restarts.
model = KMeans(n_clusters=3, n_init=10, random_state=42)

# Fit on the two MDS coordinates only. Fitting on the whole of mds_df meant
# that re-running this cell also clustered on the 'label' column it adds.
coords = mds_df[['component0', 'component1']]
model.fit(coords)

#fitting prediction to the data
mds_df['label'] = model.predict(coords)
mds_df

In [None]:
#summarizing results
# NOTE: this renames top10v's columns in place, so the earlier bar-plot cell
# (which reads 'Brand' and 'Total_Mentions') fails if re-run after this one.
top10v.columns = ['brand', "total", "pct"]
# Values are copied by position, not matched on brand name.
# NOTE(review): assumes top10v and mds_df list the brands in the same order - confirm.
mds_df["pct_mentions"] = top10v.pct.to_list()
mds_df

In [None]:
#first clustering plot

# define and map colors (one colour per k-means label 0, 1, 2)
colors = ['#DF2020', '#81DF20', '#2095DF']
mds_df['c'] = mds_df.label.map({0:colors[0], 1:colors[1], 2:colors[2]})

from scipy.spatial import ConvexHull
import numpy as np
with plt.style.context('fivethirtyeight'):
    fig, ax = plt.subplots(1, figsize=(8,8))

    # plot data
    x = mds_df.component0
    y = mds_df.component1
    ax.scatter(x, y, c=mds_df.c, s=100)

    # Label each point with its brand. Values are iterated directly rather than
    # read as x[i]: the index holds brand names, so an integer lookup relies on
    # positional fallback, which pandas has deprecated.
    for txt, x_pos, y_pos in zip(mds_df.index, x, y):
        ax.annotate(txt, (x_pos, y_pos))

    # draw enclosure
    for i in mds_df.label.unique():
        points = mds_df[mds_df.label == i][['component0', 'component1']].values
        # A 2-D convex hull needs at least three points; ConvexHull raises for
        # smaller clusters, so those are left without an enclosure.
        if len(points) < 3:
            continue
        hull = ConvexHull(points)
        # repeat the first vertex at the end to close the polygon
        outline = np.append(hull.vertices, hull.vertices[0])
        # plot shape
        ax.fill(points[outline, 0], points[outline, 1], alpha=0.3, c=colors[i])

    # NOTE(review): fixed axis limits - confirm the MDS coordinates fall inside 0..200
    ax.set_xlim(0,200)
    ax.set_ylim(0,200)

In [None]:
mds_df

In [None]:
#second clustering plot including mention volumetry

# define and map colors (one colour per k-means label 0, 1, 2)
colors = ['#DF2020', '#81DF20', '#2095DF']
mds_df['c'] = mds_df.label.map({0:colors[0], 1:colors[1], 2:colors[2]})

# Convert pct_mentions to numbers so it can scale the marker sizes
mds_df['pct_mentions'] = pd.to_numeric(mds_df['pct_mentions'])
from scipy.spatial import ConvexHull
import numpy as np
with plt.style.context('fivethirtyeight'):
    fig, ax = plt.subplots(1, figsize=(8,8))

    # plot data; marker area grows with the brand's share of mentions
    x = mds_df.component0
    y = mds_df.component1
    ax.scatter(x, y, c=mds_df.c, s=mds_df.pct_mentions*50)

    # Label each point with its brand. Values are iterated directly rather than
    # read as x[i]: the index holds brand names, so an integer lookup relies on
    # positional fallback, which pandas has deprecated.
    for txt, x_pos, y_pos in zip(mds_df.index, x, y):
        ax.annotate(txt, (x_pos, y_pos))

    # draw enclosure
    for i in mds_df.label.unique():
        points = mds_df[mds_df.label == i][['component0', 'component1']].values
        # A 2-D convex hull needs at least three points; ConvexHull raises for
        # smaller clusters, so those are left without an enclosure.
        if len(points) < 3:
            continue
        hull = ConvexHull(points)
        # repeat the first vertex at the end to close the polygon
        outline = np.append(hull.vertices, hull.vertices[0])
        # plot shape
        ax.fill(points[outline, 0], points[outline, 1], alpha=0.3, c=colors[i])

    # NOTE(review): fixed axis limits - confirm the MDS coordinates fall inside 0..200
    ax.set_xlim(0,200)
    ax.set_ylim(0,200)

In [None]:
# Interactive scatter of the MDS components, coloured by k-means cluster
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "notebook_connected"

# The colour column is string-typed so each cluster is drawn as its own
# category. The cast is done on a copy: converting mds_df['label'] in place
# broke re-running the earlier cluster plots, which map integer labels to
# colours.
plot_df = mds_df.assign(label=mds_df['label'].astype(str))

fig = px.scatter(plot_df, x="component0", y="component1", text=plot_df.index, color="label")
fig.update_traces(textposition='top center')

fig.update_layout(
    height=800,
    title_text='Brand Co-Mentions in the Forum for Finding the Ideal Car',
    yaxis_title = None,
    xaxis_title = None
)

fig.update_traces(textfont_size=14)

fig.show()