In [None]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import sklearn
import string
import warnings
import re
from scipy import sparse
from IPython.display import display, Latex, Markdown
warnings.filterwarnings('ignore')
import data_cleaning as dc
import review_score_analysis as rs

# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('omw-1.4')
# nltk.download('punkt')
# nltk.download('punkt_tab')
# nltk.download('averaged_perceptron_tagger_eng')
# nltk.download('tagsets_json')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/chasty2/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/chasty2/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/chasty2/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /home/chasty2/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /home/chasty2/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/chasty2/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/chasty2/nltk_data...
[nltk_data]   Package averaged_perceptron_ta

True

In [13]:
### Text Processing

def process(text, lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
    """ Normalizes case and handles punctuation
    Inputs:
        text: str: raw text
        lemmatizer: an instance of a class implementing the lemmatize() method
                    (the default argument is of type nltk.stem.wordnet.WordNetLemmatizer)
    Outputs:
        list(str): tokenized text
    """
    posMapping = {
    # "First_Letter by nltk.pos_tag":"POS_for_lemmatizer"
        "N":'n',
        "V":'v',
        "J":'a',
        "R":'r'
    }

    
    # Create regex to catch URLs
    url_regex = re.compile(r'''(
        (?:https?://)?        ## Optionally match http:// or https://
        (?:www\.)?            ## Optionally match www.
        [\w.-]+\.\w+          ## Match multiple domains (example.com or sub.domain.co.uk)
        (?:[/?#][^\s]*)?      ## Optionally match paths, queries, or fragments
    )''', re.VERBOSE)
    
    ### Process string
    # Remove URLs
    text = url_regex.sub("", text).strip()
    # Remove all ('s) e.g. she's -> she
    text = re.sub("'s", "", text).strip()
    # Omit other apostrophes e.g. don't -> dont
    text = re.sub("'", "", text).strip()
    # swap all other punctuation with ' '
    text = text.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))
    # Set to lowercase
    text = str.lower(text)
    
    ### Process tokens
    # tokenize string
    tokenized_text = nltk.word_tokenize(text)
    # Tag tokens
    tokenized_text = nltk.pos_tag(tokenized_text)
    # lemmatize tokens, converting pos tags based on mappings above
    lemmatized_tokens = []
    for word,tag in tokenized_text:
        try:
            lemma = lemmatizer.lemmatize(word, pos=posMapping[tag[0]])
        except KeyError:
            # Anything not caught by posMapping dict has pos 'n'
            lemma = lemmatizer.lemmatize(word, pos='n')
        # except:
        #     # Ignore other exceptions
        #     continue
        lemmatized_tokens.append(lemma)

    return lemmatized_tokens

def process_all(df, lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
    """ process all text in the dataframe using process() function.
    Inputs
        df: pd.DataFrame: dataframe containing a column 'text' loaded from the CSV file
        lemmatizer: an instance of a class implementing the lemmatize() method
                    (the default argument is of type nltk.stem.wordnet.WordNetLemmatizer)
    Outputs
        pd.DataFrame: dataframe in which the values of text column have been changed from str to list(str),
                        the output from process() function. Other columns are unaffected.
    """
    df['text'] = df['text'].apply(process)
    return df

### Feature Construction
def create_features(processed_reviews, stop_words):
    """ creates the feature matrix using the processed review text
    Inputs:
        processed_reviews: pd.DataFrame: processed reviews read from train/test  file, containing the column 'text'
        stop_words: list(str): stop_words by nltk stopwords (after processing)
    Outputs:
        sklearn.feature_extraction.text.TfidfVectorizer: the TfidfVectorizer object used
            we need this to tranform test reviews in the same way as train reviews
        scipy.sparse.csr.csr_matrix: sparse bag-of-words TF-IDF feature matrix
    """
    # Convert processed tweets text values to list of strings, with one tweet per string
    reviews_list = processed_reviews["text"].apply(lambda x: ' '.join(x)).tolist()

    # Learn vocabulary and idf, return document-term matrix
    tfidf = sklearn.feature_extraction.text.TfidfVectorizer(
        min_df=2, lowercase=False, stop_words=stop_words
    )
    X = tfidf.fit_transform(reviews_list)

    return tfidf, X


def create_labels(avg_scores_df):
    """ creates the class labels from avg_review_score
    Inputs:
        avg_scores_df: pd.DataFrame: reviews read from training df, containing the column 'avg_review_score'
    Outputs:
        numpy.ndarray(int): series of class labels 
        1 for restaurants with avg_review_score >= 4.5
        0 otherwise
    """
    # Apply vectorized  operation to score restaurants
    label_series = (avg_scores_df['avg_review_score'] >= 4.5).astype(int)

    return label_series

### Classification
def learn_classifier(X_train, y_train, kernel):
    """ learns a classifier from the input features and labels using the kernel function supplied
    Inputs:
        X_train: scipy.sparse.csr.csr_matrix: sparse matrix of features, output of create_features()
        y_train: numpy.ndarray(int): dense binary vector of class labels, output of create_labels()
        kernel: str: kernel function to be used with classifier. [linear|poly|rbf|sigmoid]
    Outputs:
        sklearn.svm.SVC: classifier learnt from data
    """
    
    classifier = sklearn.svm.SVC(kernel=kernel)
    classifier.fit(X_train, y_train)

    return classifier

def evaluate_classifier(classifier, X_validation, y_validation):
    """ evaluates a classifier based on a supplied validation data
    Inputs:
        classifier: sklearn.svm.classes.SVC: classifer to evaluate
        X_validation: scipy.sparse.csr.csr_matrix: sparse matrix of features
        y_validation: numpy.ndarray(int): dense binary vector of class labels
    Outputs:
        double: accuracy of classifier on the validation data
    """
    # Run classification of predicted political party based on each tweet
    predicted_labels = classifier.predict(X_validation)

    # Calculate accuracy of predictions
    accuracy = sklearn.metrics.accuracy_score(y_validation, predicted_labels)
    
    return accuracy

class MajorityLabelClassifier():
    """
    A classifier that predicts the mode of training labels
    """
    def __init__(self):
        """
        Initialize your parameter here
        """
        # Declare uninitialized mode
        self.mode = np.nan

    def fit(self, X, y):
        """
        Implement fit by taking training data X and their labels y and finding the mode of y
        i.e. store your learned parameter
        """
        # Convert y to a series, if it is not already
        y = pd.Series(y)
        
        # Count number of values in each label
        counts = y.value_counts()
        # Set mode to index (i.e. label) of most frequently occuring value
        self.mode = counts.idxmax()

    def predict(self, X):
        """
        Implement to give the mode of training labels as a prediction for each data instance in X
        return labels
        """
        predicted_labels = []
        for value in X:
            predicted_labels.append(self.mode)


        return predicted_labels

In [6]:
# Load dataframes
chunk_size = 100_000
restaurants_df = dc.load("data/filtered_restaurants.json", chunk_size)
reviews_df = dc.load("data/filtered_reviews.json", chunk_size)
avg_scores_df = rs.calculate_average_review_score(reviews_df)

In [7]:
avg_scores_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4371248 entries, 0 to 4371247
Data columns (total 10 columns):
 #   Column            Dtype         
---  ------            -----         
 0   review_id         object        
 1   user_id           object        
 2   business_id       object        
 3   stars             int64         
 4   useful            int64         
 5   funny             int64         
 6   cool              int64         
 7   text              object        
 8   date              datetime64[ns]
 9   avg_review_score  float64       
dtypes: datetime64[ns](1), float64(1), int64(4), object(4)
memory usage: 333.5+ MB


In [8]:
### Build and Test Model

# This takes a while, so we're gonna use 20K reviews for testing
processed_reviews = process_all(reviews_df[0:20000])
processed_reviews["text"].head()

0    [if, you, decide, to, eat, here, just, be, awa...
1    [family, diner, have, the, buffet, eclectic, a...
2    [wow, yummy, different, delicious, our, favori...
3    [cute, interior, and, owner, give, u, tour, of...
4    [i, be, a, long, term, frequent, customer, of,...
Name: text, dtype: object

In [9]:
stopwords=nltk.corpus.stopwords.words('english')
processed_stopwords = list(np.concatenate([process(word) for word in stopwords]))
(tfidf, X) = create_features(processed_reviews, processed_stopwords)

In [10]:
## TODO: Create a label for each review by merging avg score with reviews df
y = create_labels(avg_scores_df[0:20000])
y

0        0
1        0
2        0
3        0
4        0
        ..
19995    0
19996    0
19997    0
19998    0
19999    0
Name: avg_review_score, Length: 20000, dtype: int64

In [15]:
# Compare against majoritylabelclassifier

baselineClf = MajorityLabelClassifier()
# Use fit and predict methods to get predictions and compare it with the true labels y
baselineClf.fit(X,y)
predicted_labels = baselineClf.predict(X)

baseline = sklearn.metrics.accuracy_score(y, predicted_labels)

In [11]:
# TODO: learn and evaluate classifier
review_classifier = learn_classifier(X, y, 'poly')

accuracy = evaluate_classifier(review_classifier, X, y)
print(accuracy)

0.9994
