## Prepared by Ishaan Arora (ia2419)
For any questions, drop a line at ia2419@columbia.edu

In [0]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV
from sklearn.feature_extraction.text import TfidfVectorizer
from google.colab import drive
from gensim import utils
import gensim.downloader as api
import pandas as pd
import numpy as np

## Load dataset 

In [1]:
# Expose Google Drive under /gdrive so the CSV below can be read from it.
drive.mount('/gdrive')

# Read the wine reviews CSV into a pandas DataFrame.
df = pd.read_csv("/gdrive/My Drive/winemag-data-130k-v2.csv", sep=',')

# Report how many rows (one review description per row) were loaded.
row_count = len(df)
print('Number of training sentences: {:,}\n'.format(row_count))

# Show 10 randomly chosen rows as a quick sanity check of the columns.
df.sample(n=10)

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
Number of training sentences: 129,971



Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
90212,90212,Argentina,Dilute yet raisiny aromas struggle to stir int...,Fincas,84,15.0,Mendoza Province,Mendoza,,Michael Schachner,@wineschach,Alfredo Roca 2016 Fincas Pinot Noir (Mendoza),Pinot Noir,Alfredo Roca
57273,57273,Slovenia,While savory and smoky on the nose with subdue...,,83,20.0,Štajerska,,,Anna Lee C. Iijima,,Verus Vineyards 2009 Furmint (Štajerska),Furmint,Verus Vineyards
117049,117049,US,This is sophisticated stuff that lives up to O...,Oakville,93,75.0,California,Napa Valley,Napa,Jim Gordon,@gordone_cellars,Hoopes 2013 Oakville Cabernet Sauvignon (Napa ...,Cabernet Sauvignon,Hoopes
15371,15371,Spain,Bright plum and berry aromas are clear but bas...,Granate Roble,86,10.0,Northern Spain,Ribera del Duero,,Michael Schachner,@wineschach,Bodegas Fuentespina 2013 Granate Roble (Riber...,Tempranillo Blend,Bodegas Fuentespina
90924,90924,Argentina,"Rusty in color, this wine features aromas of c...",,81,8.0,Mendoza Province,Mendoza,,Michael Schachner,@wineschach,Region 1 2010 Malbec (Mendoza),Malbec,Region 1
35521,35521,US,"All stainless fermented, with the addition of ...",,87,19.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Hawkins Cellars 2015 Pinot Gris (Willamette Va...,Pinot Gris,Hawkins Cellars
103982,103982,US,Candied red and purple fruit meets mocha and c...,Goodchild Vineyard,87,30.0,California,Santa Barbara County,Central Coast,Matt Kettmann,@mattkettmann,Steele 2013 Goodchild Vineyard Pinot Noir (San...,Pinot Noir,Steele
14617,14617,US,This aromatically brooding wine offers aromas ...,Celebration Series,91,32.0,Washington,Columbia Valley (WA),Columbia Valley,Sean P. Sullivan,@wawinereport,Armstrong Family 2012 Celebration Series Caber...,Cabernet Sauvignon,Armstrong Family
46888,46888,France,This ripe wine with its red fruits and well-ba...,Rosé Brut,94,130.0,Champagne,Champagne,,Roger Voss,@vossroger,Pol Roger 2008 Rosé Brut (Champagne),Champagne Blend,Pol Roger
69990,69990,US,Lots of Russian River character in this supple...,,87,28.0,California,Russian River Valley,Sonoma,Christina Pickard,@winewchristina,MacMurray Ranch 2011 Pinot Noir (Russian River...,Pinot Noir,MacMurray Ranch


## Extract Input and output

In [0]:
# Model input: the free-text review; regression target: the reviewer's score.
X = df["description"]
y = df["points"]

## Load pre-trained glove embeddings

In [3]:
# Fetch the pre-trained "glove-twitter-25" vectors through gensim's downloader
# API; `pre_process` below uses this object as its token -> vector lookup.
w2vmodel = api.load("glove-twitter-25") 

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


## Generate an embedding for each document by averaging the embeddings of all its tokens that appear in the GloVe vocabulary

In [0]:
def pre_process(df):
  """Embed each document as the mean of its in-vocabulary token vectors.

  Args:
    df: iterable of raw text documents (e.g. a pandas Series of strings).

  Returns:
    A list with one 1-D numpy array per document, in input order. Every
    array has length `w2vmodel.vector_size`, so the list stacks cleanly
    into a 2-D feature matrix.

  Notes:
    * Tokens are produced by `gensim.utils.simple_preprocess`; tokens that
      are missing from the embedding vocabulary are skipped.
    * A document with no in-vocabulary tokens maps to an all-zero vector.
      Averaging an empty list would instead yield a scalar NaN (plus a
      RuntimeWarning) and make the feature list ragged.
    * The vectors are indexed on `w2vmodel` directly rather than through
      the `.wv` alias, which is deprecated on KeyedVectors and removed in
      gensim 4.
  """
  embeddings = []
  for row in df:
    token_vectors = [
        w2vmodel[token]
        for token in utils.simple_preprocess(row)
        if token in w2vmodel
    ]
    if token_vectors:
      embeddings.append(np.mean(token_vectors, axis=0))
    else:
      # No known tokens: fall back to a neutral vector of the right width.
      embeddings.append(np.zeros(w2vmodel.vector_size, dtype=np.float32))
  return embeddings

## Train Ridge regression on embeddings as input

In [6]:
# Hold out a test split; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
model = RidgeCV()

# Embed each split exactly once and reuse the result for fitting and scoring
# (the embedding pass over the training set is the expensive step here).
train_embed=pre_process(X_train)
test_embed=pre_process(X_test)

model.fit(train_embed, y_train)
# `score` reports R^2 for a regressor.
print('Ridge CV Train Score:',model.score(train_embed,y_train))
print('Ridge CV Test Score:',model.score(test_embed,y_test))

  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


Ridge CV Train Score: 0.1860380289557075
Ridge CV Test Score: 0.1935526355482986


## Train a TF-IDF vectorizer (character 2- to 4-grams) as a BOW model

In [0]:
# Character-level TF-IDF over 2- to 4-grams, keeping the 2000 most frequent
# n-grams. `stop_words` is intentionally not passed: scikit-learn only applies
# it when analyzer='word', so with analyzer='char' it was silently ignored
# (and triggers a warning in newer versions).
tfidf = TfidfVectorizer(analyzer='char', ngram_range=(2, 4), max_features=2000)

# Fit the vocabulary/IDF weights on the training split only, then reuse them
# for the test split so no test information leaks into the features.
train_bow = tfidf.fit_transform(X_train).toarray()
test_bow = tfidf.transform(X_test).toarray()

## Combine BOW and embeddings

In [0]:
# Place the averaged-embedding columns and the TF-IDF columns side by side,
# giving one wider feature row per document.
X_train_combine = np.concatenate((train_embed, train_bow), axis=1)
X_test_combine = np.concatenate((test_embed, test_bow), axis=1)


## Train a Ridge regression model on Combined features

In [9]:
# Fit a second Ridge model (with built-in cross-validated alpha selection)
# on the combined embedding + TF-IDF features.
revised_model = RidgeCV()
revised_model.fit(X_train_combine, y_train)

# Score both splits, then report them.
combined_train_score = revised_model.score(X_train_combine, y_train)
combined_test_score = revised_model.score(X_test_combine, y_test)
print('Ridge CV Train Score:', combined_train_score)
print('Ridge CV Test Score:', combined_test_score)

Ridge CV Train Score: 0.6563875768035287
Ridge CV Test Score: 0.6510331927696101


## Discussion :
#### The combined model (word embeddings + BOW) is a huge improvement over embeddings alone, and the majority of the improvement comes from the bag of words.