# Cross-Domain Sentiment Analysis Study
NLP Group Project

Atulya Shetty, Payton Walker

In [163]:
import pandas as pd
from sklearn.model_selection import train_test_split
import pickle
import gzip
import numpy as np
from sklearn.utils import shuffle


In [124]:
#TODO: Normalize data
def imdb_data(test_size=0.2):
    """
        Retrieves the IMDB movie review dataset and splits it into train
        and test sets

        Parameters:
        test_size = float, optional (default=0.2)
            Fraction of the reviews held out for the test set

        Returns:
        Tuple (train reviews, test reviews, train sentiment, test sentiment)
        built from the `review` and `sentiment` columns of data/imdb.csv

    """
    imdb_file = 'data/imdb.csv'
    imdb_df = pd.read_csv(imdb_file)
    X, y = imdb_df.review, imdb_df.sentiment
    # Honour the caller's test_size (it used to be hard-coded to 0.2).
    # The fixed random_state keeps the split reproducible between runs.
    return train_test_split(X, y, test_size=test_size, random_state=1)

In [125]:
# Load the IMDB reviews and sentiments, split into train and test sets
# (imdb_data is called with its default test_size).
imdb_train_review, imdb_test_review, imdb_train_sent, imdb_test_sent = imdb_data()

In [126]:
#TODO: Normalize data
def yelp_data(test_size=0.2, val_size=0.25):
    """
        Splits the Yelp reviews into train, test and validation sets and
        saves each set as a csv file under data/yelp/

        Parameters:
        test_size = float, optional (default=0.2)
            Fraction of all reviews held out for the test set
        val_size = float, optional (default=0.25)
            Fraction of the remaining (non-test) reviews held out for the
            validation set; with the defaults this is 20% of all reviews

        Returns:
        None. Writes train.csv, test.csv and dev.csv, each holding the
        `text` and `stars` columns of data/yelp.csv

    """
    import os

    yelp_file = 'data/yelp.csv'
    yelp_df = pd.read_csv(yelp_file)
    X, y = yelp_df.text, yelp_df.stars

    # Split into train, test and validation set. The fixed random_state
    # keeps the three sets identical between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=1)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=val_size, random_state=1)
    df = pd.concat([X_train, y_train], axis=1)
    df_test = pd.concat([X_test, y_test], axis=1)
    df_val = pd.concat([X_val, y_val], axis=1)

    # to_csv does not create missing directories, so make sure the target
    # folder exists before writing.
    os.makedirs('data/yelp', exist_ok=True)

    # Save as csv file.
    # NOTE(review): the row index is written as an unnamed first column
    # here, while the other csv exports in this notebook pass index=False.
    # Confirm what the downstream readers expect before changing it.
    df.to_csv(r'data/yelp/train.csv')
    df_test.to_csv(r'data/yelp/test.csv')
    df_val.to_csv(r'data/yelp/dev.csv')



In [127]:
# Write the Yelp train/test/dev csv files under data/yelp/.
yelp_data()

In [128]:
def _extract_amazon_reviews():
    """
        Rebuilds the Amazon review table from the raw Electronics_5.json
        dump and pickles it to data/Amazon.pkl so that the extraction does
        not have to be repeated.

        Returns:
        DataFrame with the `reviewText` and `overall` columns of the
        reviews that were kept
    """
    import os
    from itertools import islice

    stream = pd.read_json('Electronics_5.json', lines=True, chunksize=10000)
    try:
        # Only the first 150 chunks are used, so stop reading there
        # instead of parsing the whole dump and discarding the rest.
        # Keep reviews that are verified by Amazon and carry a `vote`
        # value (i.e. were voted helpful by users), then keep only the
        # review text and the rating.
        chunks = [
            df.loc[df.verified.eq(True) & df.vote.notna(),
                   ['reviewText', 'overall']]
            for df in islice(stream, 150)
        ]
    finally:
        # Reading stops early, so release the file handle explicitly.
        stream.close()
    reviews_df = pd.concat(chunks, sort=False)

    # Pickle the table so that we don't have to extract the data again.
    os.makedirs("data", exist_ok=True)
    with gzip.open("data/Amazon.pkl", "wb") as az:
        pickle.dump(reviews_df, az)
    return reviews_df


def load_amz(test_size=0.20):
    """
        Returns train test split for Amazon reviews

        Parameters:
        test_size = float, optional (default=0.20)
            Fraction of the reviews held out for the test set

        Returns:
        Tuple (train reviews, test reviews, train ratings, test ratings)
        built from the `reviewText` and `overall` columns

        Raises:
        Whatever pd.read_json raises when data/Amazon.pkl is missing and
        Electronics_5.json cannot be read either
    """
    try:
        # NOTE: unpickling can execute arbitrary code; only load a
        # data/Amazon.pkl that this project produced itself.
        with gzip.open("data/Amazon.pkl", 'rb') as az:
            reviews = pickle.load(az)
    except FileNotFoundError:
        # This branch shouldn't execute since we should already have a
        # pickled file with Amazon reviews. If the pickle file is missing
        # for some reason, ensure that Electronics_5.json is present.
        reviews = _extract_amazon_reviews()

    X, y = reviews.reviewText, reviews.overall
    # Both paths now honour test_size (the fallback used to hard-code 0.20).
    return train_test_split(X, y, test_size=test_size, random_state=1)

In [129]:
# Load the Amazon review texts and their `overall` ratings, split into
# train and test sets (load_amz is called with its default test_size).
amz_train_review, amz_test_review, amz_train_ratings, amz_test_ratings = load_amz()

In [130]:
def split_amazon_data(n_splits=5):
    """
        Binarizes the pickled Amazon ratings and writes the reviews out as
        csv shards named data/Amazon/Amazon_set<k>.csv (k starts at 1).

        Ratings of 4 or 5 become 1, every other rating becomes 0.

        Parameters:
        n_splits = int, optional (default=5)
            Number of csv files to spread the reviews over. Rows keep their
            stored order; when they do not divide evenly the leading shards
            get one extra row.

        Returns:
        None
    """
    import os

    # NOTE: unpickling can execute arbitrary code; only load a
    # data/Amazon.pkl that this project produced itself.
    with gzip.open("data/Amazon.pkl", 'rb') as f:
        reviews = pickle.load(f)

    # Work on an explicit copy so the relabelling neither touches the
    # loaded frame nor triggers pandas' SettingWithCopyWarning.
    df = reviews[['reviewText', 'overall']].copy()
    df.loc[:, 'overall'] = df.overall.isin([4, 5]).astype(int)

    # to_csv does not create missing directories.
    os.makedirs('data/Amazon', exist_ok=True)

    # Split row positions rather than the frame itself: this yields the
    # same partitions as np.array_split(df, n) without relying on
    # array_split's handling of DataFrames.
    row_groups = np.array_split(np.arange(len(df)), n_splits)
    for idx, rows in enumerate(row_groups):
        df.iloc[rows].to_csv(f'data/Amazon/Amazon_set{idx+1}.csv', index=False)

In [131]:
# Binarize the Amazon ratings and write the reviews out as csv shards
# under data/Amazon/.
split_amazon_data()

(50000, 2)

FileNotFoundError: [Errno 2] No such file or directory: 'data/yelp_imbd/test.csv'

In [187]:
def combine_yelp_imbd(test_size=0.2, val_size=0.25):
    """
        Combines the Yelp and IMDB reviews into one binary sentiment
        dataset and saves train, test and validation sets as csv files
        under data/yelp_imdb/

        Labels: Yelp reviews with 4 or 5 stars and IMDB reviews marked
        "positive" become 1, everything else becomes 0. Both sources end
        up in the columns `text` and `stars`.

        Parameters:
        test_size = float, optional (default=0.2)
            Fraction of all reviews held out for the test set
        val_size = float, optional (default=0.25)
            Fraction of the remaining (non-test) reviews held out for the
            validation set; with the defaults this is 20% of all reviews

        Returns:
        None. Writes train.csv, test.csv and dev.csv without the row index
    """
    import os

    # Read Yelp data and filter out only reviews and ratings.
    yelp_df = pd.read_csv('data/yelp.csv')[['text', 'stars']]
    yelp_df = yelp_df.assign(stars=yelp_df.stars.isin([4, 5]).astype(int))

    # Read IMDB data and filter out reviews and ratings. Change the column
    # names to match the Yelp data since the two datasets are combined.
    imdb_df = pd.read_csv('data/imdb.csv')[['review', 'sentiment']]
    imdb_df = imdb_df.rename(columns={'review': 'text', 'sentiment': 'stars'})
    imdb_df = imdb_df.assign(stars=(imdb_df.stars == "positive").astype(int))

    # Combine Yelp and IMDB reviews. ignore_index avoids the duplicate row
    # labels that stacking two independently indexed frames would create.
    yelp_imdb = pd.concat([yelp_df, imdb_df], axis=0, ignore_index=True)

    # Split into train, test and validation set. Splitting the frame
    # itself keeps each text paired with its label. The fixed random_state
    # keeps the three sets identical between runs.
    df, df_test = train_test_split(
        yelp_imdb, test_size=test_size, random_state=1)
    df, df_dev = train_test_split(df, test_size=val_size, random_state=1)

    # to_csv does not create missing directories.
    os.makedirs('data/yelp_imdb', exist_ok=True)

    # Save train, test and validation set as csv file.
    df.to_csv(r'data/yelp_imdb/train.csv', index=False)
    df_test.to_csv(r'data/yelp_imdb/test.csv', index=False)
    df_dev.to_csv(r'data/yelp_imdb/dev.csv', index=False)

In [188]:
# Build the combined Yelp + IMDB train/test/dev csv files under
# data/yelp_imdb/.
combine_yelp_imbd()