In [None]:
#importing library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from sklearn.metrics import r2_score

In [None]:
# Load the car-review CSV from the Kaggle input directory into a DataFrame.
df=pd.read_csv('/kaggle/input/edmunds-car-review/Review.csv')
# Show the loaded frame as the cell output for a quick visual check.
df

In [None]:
# Summarise the columns, their dtypes and non-null counts to spot missing data early.
df.info()

In [None]:
def process_texts(texts):
    """Clean a Series of raw strings before tokenization.

    For every string, in this order:
      1. remove each literal two-character sequence made of a backslash
         followed by the letter ``n`` (escaped newlines left in the text);
      2. remove every run of digits;
      3. split on whitespace, drop English stop words (compared in lower
         case) and re-join the remaining words with single spaces.

    Parameters
    ----------
    texts : pandas.Series of str
        The input is not modified. Every value must be a string; a missing
        value makes ``re.sub`` raise ``TypeError``.

    Returns
    -------
    pandas.Series
        A new Series with the same index holding the cleaned strings.
    """
    # A set gives constant-time membership tests; the list returned by NLTK
    # would otherwise be scanned once for every word of every text.
    stop_words=set(stopwords.words('english'))

    def clean(text):
        text=re.sub(r'\\n','',text)
        text=re.sub(r'\d+','',text)
        return " ".join(word for word in text.split() if word.lower() not in stop_words)

    # A single pass over the Series instead of three separate ``apply`` calls.
    return texts.apply(clean)


def get_sequences(texts,num_words=50000):
    """Tokenize ``texts`` and return them as one zero-padded integer matrix.

    A new ``Tokenizer`` is fitted on ``texts`` itself, every text is converted
    to its list of word indices, and all lists are padded at the end
    (``padding='post'``) to the length of the longest one. That length is
    printed as a side effect.

    Parameters
    ----------
    texts : iterable of str
        Texts to fit the tokenizer on and to encode.
    num_words : int, default 50000
        Forwarded to ``Tokenizer(num_words=...)``. Any embedding layer that
        consumes the result needs an ``input_dim`` of at least this value.

    Returns
    -------
    The array produced by ``pad_sequences``: one row per text, one column per
    position up to the longest sequence.

    Raises
    ------
    ValueError
        If ``texts`` is empty (there is no longest sequence).
    """
    tokenizer=Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(texts)
    encoded=tokenizer.texts_to_sequences(texts)

    # Builtin max over the lengths; no need to build a list and go through NumPy.
    max_sequence_length=max(len(sequence) for sequence in encoded)
    print('The maximum sequence length',max_sequence_length)

    return pad_sequences(encoded,maxlen=max_sequence_length,padding='post')


def encode_date(df,column):
    """Replace a date column with numeric year, month and day columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is copied, not modified.
    column : str
        Name of the column holding the dates. Values that cannot be parsed
        become missing (``errors='coerce'``).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``column`` and with 'ReviewYear', 'ReviewMonth'
        and 'ReviewDay' appended. Rows whose date could not be parsed hold
        NaN in all three new columns.
    """
    df=df.copy()
    dates=pd.to_datetime(df[column],errors='coerce')
    # The vectorized .dt accessor replaces three row-by-row Python lambdas.
    df['ReviewYear']=dates.dt.year
    df['ReviewMonth']=dates.dt.month
    df['ReviewDay']=dates.dt.day
    return df.drop(column,axis=1)



In [None]:
def onehot_encode(df,columns):
    """One-hot encode the given categorical columns as 0/1 integer columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is copied, not modified.
    columns : iterable of str
        Columns to encode. Each one is removed and its indicator columns,
        named ``<column>_<value>``, are appended at the end of the frame.

    Returns
    -------
    pandas.DataFrame
        The encoded copy. Any boolean column still present in the frame is
        cast to int as well, so the result contains no bool columns.
    """
    df=df.copy()
    for name in columns:
        # dtype=int yields 0/1 integers directly, so the indicator columns
        # never need a second bool -> int pass.
        dummies=pd.get_dummies(df[name],prefix=name,dtype=int)
        df=pd.concat([df.drop(name,axis=1),dummies],axis=1)

    # One vectorized cast for boolean columns that were already in the frame,
    # instead of rescanning every column after each encoded feature.
    bool_columns=df.select_dtypes(include='bool').columns
    return df.astype({name:int for name in bool_columns})

In [None]:
# Quick check of get_sequences on the raw review text. `x` is not defined at
# notebook scope, so the column is read from `df`; rows without a review are
# dropped first, mirroring what preprocess_inputs does.
get_sequences(df['Review'].dropna())

In [None]:
# Inspect the raw 'Date' column; encode_date later parses it into year/month/day.
df['Date']

In [None]:
def preprocess_inputs(df):
    """Turn the raw review table into train/test inputs for the model.

    Steps, in order:
      1. drop the 'Reviewer' column and every row without 'Review' text;
      2. fill missing 'Title' values with the most frequent title;
      3. clean 'Title' and 'Review' with process_texts and convert each to a
         padded integer matrix with get_sequences;
      4. one-hot encode 'Company' and 'Model', and expand 'Date' into
         'ReviewYear', 'ReviewMonth' and 'ReviewDay' (missing parts are
         filled with the column mean);
      5. split everything 70/30 with a fixed seed, then standardize the
         tabular features with a scaler fitted on the training rows only.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing at least the columns 'Reviewer', 'Title',
        'Review', 'Company', 'Model', 'Date' and 'Rating'. It is not modified.
        Any other column ends up in the tabular features and is passed to
        StandardScaler, so it is assumed to be numeric.

    Returns
    -------
    tuple
        ``(title_train, title_test, reviews_train, reviews_test,
        x_train, x_test, y_train, y_test)``. The title/review items are the
        padded matrices, ``x_*`` are standardized DataFrames and ``y_*`` are
        the 'Rating' values.
    """
    # Filter on the missing values directly. Dropping by index label would
    # remove additional rows whenever the incoming index is not unique.
    # drop/dropna/reset_index all return new frames, so the caller's frame is
    # never touched.
    df=(
        df.drop('Reviewer',axis=1)
        .dropna(subset=['Review'])
        .reset_index(drop=True)
    )

    # Titles are optional: fall back to the most frequent one.
    df['Title']=df['Title'].fillna(df['Title'].mode()[0])

    # Text branches: cleaned strings -> padded integer sequences.
    title=get_sequences(process_texts(df['Title']))
    reviews=get_sequences(process_texts(df['Review']))
    df=df.drop(['Title','Review'],axis=1)

    # Tabular branch: categorical and date features.
    df=onehot_encode(df,['Company','Model'])
    df=encode_date(df,'Date')

    # Filling missing value with mean of that column
    for column in ['ReviewYear','ReviewMonth','ReviewDay']:
        df[column]=df[column].fillna(df[column].mean())

    y=df['Rating']
    x=df.drop('Rating',axis=1)

    # A single call keeps the rows of all four containers aligned.
    title_train,title_test,\
    reviews_train,reviews_test,\
    x_train,x_test,\
    y_train,y_test=train_test_split(title,reviews,x,y,train_size=0.7,random_state=123,shuffle=True)

    # Fit the scaler on the training rows only so that no test-set statistics
    # leak into the features.
    scaler=StandardScaler()
    x_train=pd.DataFrame(scaler.fit_transform(x_train),columns=x_train.columns,index=x_train.index)
    x_test=pd.DataFrame(scaler.transform(x_test),columns=x_test.columns,index=x_test.index)

    return title_train,title_test,reviews_train,reviews_test,x_train,x_test,y_train,y_test

In [None]:
# Index labels of the rows whose 'Review' text is missing (preprocess_inputs drops these rows).
df.loc[df['Review'].isna(),:].index

In [None]:
#checking for missing values in each column of the raw dataset
# (`x` is never defined at notebook scope, so the check runs on `df`)
df.isna().sum()

In [None]:
# Number of distinct values per column; dropna=False counts a missing value as
# its own distinct value.
df.nunique(dropna=False).to_dict()

In [None]:
# Run the preprocessing pipeline once and unpack its eight train/test outputs.
title_train,title_test,reviews_train,reviews_test,x_train,x_test,y_train,y_test=preprocess_inputs(df)
# Show the standardized tabular training features as the cell output.
x_train

In [None]:
# Shape of the padded title-token matrix for the training split.
title_train.shape

In [None]:
# Shape of the padded review-token matrix for the training split.
reviews_train.shape

In [None]:
# Shape of the standardized tabular features; the column count sizes the model's 'x_inputs' layer.
x_train.shape

In [None]:
# Shape of the training targets.
y_train.shape

In [None]:
# Training targets: the 'Rating' values selected for the training split.
y_train

In [None]:
#Constructing the Model
# Three input branches: the tabular features pass through two dense layers,
# while the title and review token sequences are each embedded and flattened.
# The branches are concatenated and fed to a single linear output unit.

# Size of the embedding tables. It must be at least the Tokenizer's num_words
# (50000 in get_sequences); a smaller table would receive out-of-range token ids.
vocab_size=50000

x_inputs=tf.keras.Input(shape=(x_train.shape[1],),name='x_inputs')
dense_1=tf.keras.layers.Dense(64,activation='relu',name='dense_1')(x_inputs)
dense_2=tf.keras.layers.Dense(64,activation='relu',name='dense_2')(dense_1)

# The sequence lengths are read from the padded training arrays returned by
# preprocess_inputs; the `title`/`reviews` arrays are local to that function
# and do not exist at notebook scope.
# `input_length` is not passed to Embedding: the Input shape already fixes the
# sequence length and the argument is deprecated in recent Keras versions.
title_inputs=tf.keras.Input(shape=(title_train.shape[1],),name='title_inputs')
title_embedding=tf.keras.layers.Embedding(input_dim=vocab_size,output_dim=64,name='title_embedding')(title_inputs)
title_flatten=tf.keras.layers.Flatten(name='title_flatten')(title_embedding)

reviews_inputs=tf.keras.Input(shape=(reviews_train.shape[1],),name='reviews_inputs')
reviews_embedding=tf.keras.layers.Embedding(input_dim=vocab_size,output_dim=64,name='reviews_embedding')(reviews_inputs)
reviews_flatten=tf.keras.layers.Flatten(name='reviews_flatten')(reviews_embedding)

concat=tf.keras.layers.concatenate([dense_2,title_flatten,reviews_flatten],name='concatenation')
outputs=tf.keras.layers.Dense(1,activation='linear')(concat)

model=tf.keras.Model(inputs=[x_inputs,title_inputs,reviews_inputs],outputs=outputs)
model.summary()

In [None]:
# Draw the model's layer graph to check how the three input branches are wired together.
tf.keras.utils.plot_model(model)

In [None]:
# Train with Adam on mean squared error. 20% of the training data is held out
# for validation, and training stops early once val_loss has not improved for
# 5 epochs, restoring the best weights seen.
model.compile(loss='mse',optimizer='adam')
history=model.fit(
    x=[x_train,title_train,reviews_train],
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=5,
            restore_best_weights=True,
        )
    ],
)

In [None]:
#Evaluating the model on the held-out test set
# The targets are the second positional argument of evaluate; the inputs are
# listed in the same order as the model's inputs.
model.evaluate([x_test,title_test,reviews_test],y_test)

In [None]:
# Total number of missing values left in the processed tabular features.
# `x` is local to preprocess_inputs, so the returned train and test frames are checked instead.
x_train.isna().sum().sum()+x_test.isna().sum().sum()

In [None]:
# Padded title sequences for the training split (`title` itself is local to preprocess_inputs).
title_train

In [None]:
# Padded review sequences for the training split (`review` is not defined anywhere in the notebook).
reviews_train