In [81]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from keras.models import Sequential, Model
from keras.layers import Embedding, LSTM, Dense, Input, concatenate, Dropout
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras import regularizers
from keras.models import load_model
import tensorflow as tf

### **Reading Data**

In [113]:
df = pd.read_csv("train.csv", encoding='latin-1')
df.head()

  df = pd.read_csv("train.csv", encoding='latin-1')


Unnamed: 0,user_name,country,review_title,review_description,designation,points,price,province,region_1,region_2,winery,variety,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17
0,,Australia,Andrew Peace 2007 Peace Family Vineyard Chardo...,"Classic Chardonnay aromas of apple, pear and h...",Peace Family Vineyard,83,10,Australia Other,South Eastern Australia,,Andrew Peace,Chardonnay,,,,,,
1,@wawinereport,US,North by Northwest 2014 Red (Columbia Valley (...,This wine is near equal parts Syrah and Merlot...,,89,15,Washington,Columbia Valley (WA),Columbia Valley,North by Northwest,Red Blend,,,,,,
2,,Italy,Renato Ratti 2007 Conca (Barolo),Barolo Conca opens with inky dark concentratio...,Conca,94,80,Piedmont,Barolo,,Renato Ratti,Nebbiolo,,,,,,
3,@vossroger,France,Domaine l'Ancienne Cure 2010 L'Abbaye White (B...,It's impressive what a small addition of Sauvi...,L'Abbaye,87,22,Southwest France,Bergerac Sec,,Domaine l'Ancienne Cure,Bordeaux-style White Blend,,,,,,
4,@vossroger,France,Château du Cèdre 2012 Le Cèdre Vintage Malbec ...,"This ripe, sweet wine is rich and full of drie...",Le Cèdre Vintage,88,33,France Other,Vin de Liqueur,,Château du Cèdre,Malbec,,,,,,


### **Cleaning the data**

In [110]:
def cleaning(df):
  try:
    i = 0
    for x in df["Unnamed: 12"].isna():
        if not x:
            df = df.drop(i)
        i += 1

    df = df.iloc[:, :12]
    df = df.dropna(subset=['winery'])
  except:
    pass

  df = df.drop(["user_name", "designation", "province", "region_1", "region_2", "winery"], axis=1)

  df['price'] = pd.to_numeric(df['price'], errors='coerce')
  df['price'] = df['price'].astype(float)

  df['points'] = pd.to_numeric(df['points'], errors='coerce')
  df['points'] = df['points'].astype(float)

  df['country'].fillna(df['country'].mode()[0], inplace=True)
  df['price'].fillna(df['price'].mean(), inplace=True)
  df['points'].fillna(df['points'].mean(), inplace=True)

  return df

df = cleaning(df)

### **Feature Engineering**

In [101]:
def feature_egg(df):
  label_encoder = LabelEncoder()
  df["country"] = label_encoder.fit_transform(df["country"])
  df["review"] = df["review_title"]+df["review_description"]
  df.drop(["review_title", "review_description"], axis=1, inplace=True)

  def category(points):
    if points == 80 and points < 83:
        return 0
    elif points == 83 and points < 87:
        return 1
    elif points == 87 and points < 90:
        return 2
    elif points == 90 and points < 94:
        return 3
    elif points == 94 and points < 98:
        return 4
    else:
        return 5

  df["rating"] = df["points"].apply(category)
  df.drop(["points"], axis=1, inplace=True)

  return df

df = feature_egg(df)


### **Preprocessing the data**

In [89]:
# Preprocess the data
text_data = df['review'].values
ratings = df['rating'].values
prices = df['price'].values
countries = df['country'].values
labels = df['variety'].values

# Encode the labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

In [86]:
# Split the dataset into train and test sets
X_text_train, X_text_test, X_rating_train, X_rating_test, X_price_train, X_price_test, X_country_train, X_country_test, y_train, y_test = train_test_split(
    text_data, ratings, prices, countries, encoded_labels, test_size=0.2, random_state=42
)

In [87]:
# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_text_train)

In [27]:
# Convert text data to sequences
X_text_train_sequences = tokenizer.texts_to_sequences(X_text_train)
X_text_test_sequences = tokenizer.texts_to_sequences(X_text_test)

In [29]:
# Pad sequences to a fixed length
max_sequence_length = 400
X_text_train_padded = pad_sequences(X_text_train_sequences, maxlen=max_sequence_length)
X_text_test_padded = pad_sequences(X_text_test_sequences, maxlen=max_sequence_length)

In [30]:
# Build the model
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 400

# Text input branch
text_input = Input(shape=(max_sequence_length,))
embedding_layer = Embedding(vocab_size, embedding_dim, input_length=max_sequence_length)(text_input)
lstm_layer = LSTM(400)(embedding_layer)

In [31]:
# Scale numerical features
scaler = MinMaxScaler()
X_rating_train_scaled = scaler.fit_transform(X_rating_train.reshape(-1, 1))
X_rating_test_scaled = scaler.transform(X_rating_test.reshape(-1, 1))
X_price_train_scaled = scaler.fit_transform(X_price_train.reshape(-1, 1))
X_price_test_scaled = scaler.transform(X_price_test.reshape(-1, 1))
X_country_train_scaled = scaler.fit_transform(X_country_train.reshape(-1, 1))
X_country_test_scaled = scaler.transform(X_country_test.reshape(-1, 1))

In [32]:
# Numerical input branch
rating_input = Input(shape=(1,))
price_input = Input(shape=(1,))
country_input = Input(shape=(1,))

In [33]:
# Concatenate text and numerical inputs
concatenated = concatenate([lstm_layer, rating_input, price_input, country_input])

In [46]:
# Fully connected layers
dense1 = Dense(64, activation='relu', kernel_regularizer=regularizers.l1(0.001))(concatenated)
drop = Dropout(0.25)(dense1)
dense2 = Dense(32, activation='relu')(drop)
drop1 = Dropout(0.1)(dense2)
output = Dense(len(label_encoder.classes_), activation='softmax')(drop1)

In [47]:
from keras.models import Sequential, Model
# Create the model
model = Model(inputs=[text_input, rating_input, price_input, country_input], outputs=output)

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [48]:
epochs = 10
batch_size = 64
model.fit([X_text_train_padded, X_rating_train_scaled, X_price_train_scaled, X_country_train_scaled], y_train,
          epochs=epochs, batch_size=batch_size, validation_data=([X_text_test_padded, X_rating_test_scaled, X_price_test_scaled, X_country_test_scaled], y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fb6ce1122f0>

In [49]:
loss, accuracy = model.evaluate([X_text_test_padded, X_rating_test_scaled, X_price_test_scaled, X_country_test_scaled], y_test, verbose=0)
print(f'Loss: {loss:.4f}')
print(f'Accuracy: {accuracy:.4f}')

Loss: 0.2660
Accuracy: 0.9651


# **Model Accuracy: 0.965**

In [51]:
# Save the model
model.save('model.h5')

# <font color="Red">=============Prediction on given dataset===============</font>

### **Preprocessing the data**

In [102]:
def preprocessing(df):
  # Preprocess the data
  text_test = df['review'].values
  rating_test = df['rating'].values
  price_test = df['price'].values
  country_test = df['country'].values

  # Tokenize the text data
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(text_test)
  text_test_sequences = tokenizer.texts_to_sequences(text_test)

  # Pad sequences to a fixed length
  max_sequence_length = 400
  text_test_padded = pad_sequences(text_test_sequences, maxlen=max_sequence_length)

  # Scale numerical features
  scaler = MinMaxScaler()

  rating_test_scaled = scaler.fit_transform(rating_test.reshape(-1, 1))
  price_test_scaled = scaler.fit_transform(price_test.reshape(-1, 1))
  country_test_scaled = scaler.fit_transform(country_test.reshape(-1, 1))

  return [text_test_padded, rating_test_scaled, price_test_scaled, country_test_scaled]



### **Reading Data**

In [None]:
df = pd.read_csv("test.csv")
df = cleaning(df)
df = feature_egg(df)
preprocessing(df)

In [103]:
loaded_model = load_model('model.h5')
data = preprocessing(df)
predictions = loaded_model.predict(data)

predicted_labels = label_encoder.inverse_transform(predictions.argmax(axis=1))


print('Predicted Varieties:')
print(predicted_labels)

Predicted Varieties:
['White Blend' 'White Blend' 'Chardonnay' ... 'Pinot Noir'
 'Sparkling Blend' 'Pinot Noir']


In [112]:
df['variety'] = predicted_labels
df.to_csv("prediction.csv", index=False)