### TASK A

In [79]:
import pandas as pd

# Load the models and brands mapping
mapping_df = pd.read_csv('car_models_and_brands.csv')

# Load the data from Data Scrapper Code
data_df = pd.read_csv('Edmunds_entryluxcar.csv')

# Create a dictionary for mapping models to brands
model_to_brand = {}
for _, row in mapping_df.iterrows():
    brand = row['Brand']
    models = row['Model'].split('|')
    for model in models:
        model_to_brand[model] = brand

model_to_brand['s4'] = 'audi'
model_to_brand['tlx'] = 'acura'
model_to_brand['awd'] = 'cooper'
model_to_brand['daytona'] = 'dodge'
model_to_brand['wrangler'] = 'jeep'

In [40]:
data_df = data_df.drop(columns='Unnamed: 0')

In [41]:
data_df.head()

Unnamed: 0,User_ID,Date,Comment
0,m6user,June 2013,That's why I said given the same scenario as w...
1,fedlawman,June 2013,"Semantics, really, and I do see your point, bu..."
2,m6user,June 2013,"No, i think we agree in general and that's pre..."
3,fedlawman,June 2013,"So this begs the question, before the automobi..."
4,dino001,June 2013,I agree with your previous statement. Many peo...


In [42]:
import matplotlib.pyplot as plt
import numpy as np
import re

# Extract the comment column
comments = data_df[data_df['Comment'].apply(lambda x: isinstance(x, str))]['Comment']

# Tokenize and preprocess the comments (remove punctuation and convert to lowercase)
def preprocesstext(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    words = re.findall(r'\b\w+\b', text.lower())  # Tokenize and convert to lowercase
    return list(set(words))

# Tokenize and preprocess the comments
all_words = []
for comment in comments:
    words = preprocesstext(comment)
    all_words.extend(words)

# Calculate word frequencies
word_frequencies = pd.Series(all_words).value_counts()

In [198]:
print(word_frequencies)

the                  5123
a                    4600
to                   4214
and                  4089
i                    4020
                     ... 
xfs                     1
supportadjustment       1
bonded                  1
botched                 1
displayexceptthe        1
Length: 23755, dtype: int64


### TASK B: Word frequency table after removing stop words

In [170]:
import matplotlib.pyplot as plt
import numpy as np
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords

# Download NLTK stopwords (you only need to do this once)
nltk.download('stopwords')

# Extract the comment column
comments = data_df[data_df['Comment'].apply(lambda x: isinstance(x, str))]['Comment']

# Get the NLTK English stop words
stop_words = set(stopwords.words('english'))

# Tokenize and preprocess the comments (remove punctuation, convert to lowercase, and remove stop words)
def preprocesstext(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    words = re.findall(r'\b\w+\b', text.lower())  # Tokenize and convert to lowercase
    nswords = [word for word in words if word not in stop_words]
    nswords = [model_to_brand.get(word, word) for word in nswords]
    return list(set(nswords))

# Tokenize and preprocess the comments, removing stop words
words_nostop = []
for comment in comments:
    words = preprocesstext(comment)
    words_nostop.extend(words)

# Calculate word frequencies
words_nostop_freq = pd.Series(words_nostop).value_counts()

wnf_df = pd.DataFrame(words_nostop_freq)
wnf_df = wnf_df.reset_index()
wnf_df = wnf_df.rename(columns={'index': 'words', 0: 'frequency'})

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/milindbhatia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [175]:
wnf_df.iloc[61:80,:]

top_10_brand_ls = ['bmw','audi','acura','honda','cooper','volkswagen','dodge','subaru','jeep','toyota']

top_10_df = wnf_df[wnf_df['words'].isin(top_10_brand_ls)]
top_10_df = top_10_df.rename(columns={0: 'frequency'})

print(top_10_df)

         words  frequency
2          bmw       1882
7         audi       1521
8        acura       1493
10      cooper       1404
11       honda       1321
46  volkswagen        649
50       dodge        634
51        jeep        633
63      subaru        560
67      toyota        547


### TASK C: LIFT Analysis

In [176]:
token_df = pd.DataFrame(comments)
token_df['Tokenized_Message'] = token_df['Comment'].map(preprocesstext)

def replace_words_in_list(word_list, replacements):
    ls = [replacements.get(word, word) for word in word_list]
    return list(set(ls))

token_df['Mapped_words'] = token_df['Tokenized_Message'].apply(lambda x: replace_words_in_list(x, model_to_brand))


In [196]:
word_distance = 7

def distance_thresh(words_list, word1, word2, max_distance):
    indices_word1 = [i for i, word in enumerate(words_list) if word == word1]
    indices_word2 = [i for i, word in enumerate(words_list) if word == word2]
    for idx1 in indices_word1:
        for idx2 in indices_word2:
            if abs(idx1 - idx2) <= max_distance:
                return True
    return False

def lift_analysis(data,word_A,word_B,max_distance):
    if (word_A == word_B):
        return '-'
    else:
        N = data.shape[0]
        N_A = sum(1 for review in data if word_A in review)
        N_B = sum(1 for review in data if word_B in review)
        N_A_and_B = sum(1 for review in data if distance_thresh(review, word_A, word_B, max_distance) and word_A in review and word_B in review)
    
        lift = (N * N_A_and_B) / (N_A * N_B)

        return np.round(lift,4)

df_lift = []

for index, brand1 in top_10_df.iterrows():
    for index,brand2 in top_10_df.iterrows():
        df_lift.append(lift_analysis(token_df['Mapped_words'],brand1['words'],brand2['words'],word_distance))


In [197]:
import numpy as np
lift_np = np.reshape(df_lift,(10,10))
lift_df = pd.DataFrame(data= lift_np)
for i in range(10):
    lift_df = lift_df.rename(columns = {i: top_10_df.iloc[i, 0]})
    lift_df = lift_df.rename(index = {i: top_10_df.iloc[i, 0]})
lift_df

Unnamed: 0,bmw,audi,acura,cooper,honda,volkswagen,dodge,jeep,subaru,toyota
bmw,-,0.0822,0.1095,0.0251,0.034,0.2993,0.0708,0.0051,0.063,0.3398
audi,0.0822,-,0.4064,0.3644,0.5434,0.1493,2.3019,0.0752,0.0567,0.0145
acura,0.1095,0.4064,-,0.518,1.7892,0.1582,0.1338,0.0064,0.6421,0.0812
cooper,0.0251,0.3644,0.518,-,1.2489,0.0,1.1113,1.2149,0.5754,0.322
honda,0.034,0.5434,1.7892,1.2489,-,0.0206,0.3385,0.3751,0.0734,0.2504
volkswagen,0.2993,0.1493,0.1582,0.0,0.0206,-,0.2149,0.0,0.2596,0.8304
dodge,0.0708,2.3019,0.1338,1.1113,0.3385,0.2149,-,0.0451,0.1019,0.0348
jeep,0.0051,0.0752,0.0064,1.2149,0.3751,0.0,0.0451,-,0.017,0.1742
subaru,0.063,0.0567,0.6421,0.5754,0.0734,0.2596,0.1019,0.017,-,0.0591
toyota,0.3398,0.0145,0.0812,0.322,0.2504,0.8304,0.0348,0.1742,0.0591,-


### BELOW LINES TO BE REMOVED - ONLY FOR DEBUGGING

In [None]:
lift_analysis(token_df['Mapped_words'],'bmw','honda',word_distance)

In [184]:
word_A = 'bmw'
word_B = 'honda'
N = token_df['Mapped_words'].shape[0]
N_A = sum(1 for review in token_df['Mapped_words'] if word_A in review)
N_B = sum(1 for review in token_df['Mapped_words'] if word_B in review)    
N_A_and_B = sum(1 for review in token_df['Mapped_words'] if distance_thresh(review, word_A, word_B, 7) and word_A in review and word_B in review)
lift = (N * N_A_and_B) / (N_A * N_B)
print(N_A_and_B)

14


In [183]:
count_a  = 0
word_A = 'bmw'
word_B = 'honda'
for review in token_df['Mapped_words']:
    if distance_thresh(review, word_A, word_B, 7) and word_A in review and word_B in review:
        print(review)

['read', 'cs', 'interesting', 'sells', 'sh', 'xle', 'link', 'jeep', '4xe', 'mb', '2022', 'type', 'didnt', '3s', 'realize', 'cooper', 'thanks', 'honda', 'many', 'sween', 'sahara', 'bmw', 'toyota', '2001']
['wheel', 'going', 'sh', 'xle', 'next', 'jeep', '4xe', 'lease', '2022', 'type', 'want', 'leasing', 'purchase', 'cooper', 'honda', 'hamster', 'sahara', 'vehicle', 'im', 'bmw', 'toyota', '2001']
['heard', 'driven', 'fedthanks', 'yet', 'hybrid', '2023', 'matter', 'driving', 'first', 'ive', 'event', 'person', 'drive', 'one', 'road', 'fact', 'honda', 'touring', '320i', 'havent', 'youre', 'couple', 'saw', 'bmw', 'test', 'seen']
['biggest', 'doesnt', 'backup', 'camera', 'sh', 'xle', 'jeep', '4xe', '328xi', 'feature', '2022', 'type', 'cooper', 'honda', 'acura', 'miss', 'sahara', 'bmw', 'toyota', '2001']
['sahara', 'honda', 'jeep', '4xe', 'need', 'another', '2022', 'bmw', 'type', 'sh', 'xle', 'toyota', '50000', 'cooper', '2001']
['honda', '2023', 'update', 'check', 'touring', 'bmw', 'fnthanks',