# Cross-Domain Sentiment Analysis Study
NLP Group Project

Atulya Shetty, Payton Walker

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import pickle
import gzip
import numpy as np
from sklearn.utils import shuffle

In [2]:
"""
    Returns train test split for Amazon reviews
"""
def load_amz(test_size=0.20):
    """
        Returns train test split for Amazon reviews.

        The reviews are loaded from the gzip-pickled cache when it exists.
        Otherwise they are extracted from Electronics_5.json, cached, and
        then split.

        Parameters:
        test_size = float, optional (default=0.20)
            Fraction of the reviews placed in the test split. It is honoured
            on both the cached path and the rebuild path.

        Returns:
        The X_train, X_test, y_train, y_test sequence produced by
        train_test_split, where X is the reviewText column and y is the
        overall column.

        Raises:
        FileNotFoundError if neither the cache nor Electronics_5.json exists.
    """
    # Function-scope imports keep this cell independent of the import cell.
    import os
    from itertools import islice

    # Single location for the cache: it is read from and written to the same
    # file, so a rebuilt cache is actually found on the next call.
    cache_path = "data/final/Amazon.pkl"

    try:
        # NOTE: pickle executes arbitrary code on load; only ever point this
        # at a cache file produced by this notebook.
        with gzip.open(cache_path, 'rb') as az:
            reviews = pickle.load(az)
    except FileNotFoundError:
        # This branch shouldn't normally execute since the pickled file with
        # the Amazon reviews should already exist. If the pickle is missing,
        # ensure that Electronics_5.json is present.
        stream = pd.read_json('Electronics_5.json', lines=True, chunksize=10000)

        # Only the first 150 chunks are used. islice stops reading there
        # instead of parsing and holding the whole file in memory first.
        # Keep reviews that are verified by Amazon and have helpfulness
        # votes, and only the review text and rating columns.
        kept_chunks = [
            chunk.loc[chunk.verified.eq(True) & chunk.vote.notna(),
                      ['reviewText', 'overall']]
            for chunk in islice(stream, 150)
        ]
        reviews = pd.concat(kept_chunks, sort=False)

        # Pickle the frame so that the data need not be extracted again.
        os.makedirs(os.path.dirname(cache_path), exist_ok=True)
        with gzip.open(cache_path, "wb") as az:
            pickle.dump(reviews, az)

    X, y = reviews.reviewText, reviews.overall
    return train_test_split(X, y, test_size=test_size, random_state=1)

In [3]:
def split_amazon_data():
    """
        Build the balanced Amazon test set and save it for every scenario.

        Ratings of 4 or 5 are relabelled 1 and every other rating 0. The
        first 5000 positive and first 5000 negative reviews are combined,
        shuffled with a fixed seed, and written unchanged to
        data/S1..S4/test.csv, so all four scenarios share the same test set.

        Returns:
        None. Only writes the four CSV files.

        Raises:
        FileNotFoundError if data/final/Amazon.pkl does not exist.
    """
    # The context manager closes the gzip handle even if unpickling fails.
    with gzip.open("data/final/Amazon.pkl", 'rb') as f:
        reviews = pickle.load(f)

    # Explicit copy: the labels are rewritten below and must not be assigned
    # into a slice of the unpickled frame.
    df = reviews[['reviewText', 'overall']].copy()
    df['overall'] = df.overall.apply(lambda x: 1 if x == 4 or x == 5 else 0)

    df_pos = df.loc[df['overall'] == 1][:5000]
    df_neg = df.loc[df['overall'] == 0][:5000]
    # A second 5000/5000 slice used to be built and shuffled here but was
    # never written anywhere, so it has been dropped.
    df_comb = shuffle(pd.concat([df_pos, df_neg], axis=0), random_state=12)

    for test_path in (r'data/S1/test.csv',
                      r'data/S2/test.csv',
                      r'data/S3/test.csv',
                      r'data/S4/test.csv'):
        df_comb.to_csv(test_path, index=False)

# Write the Amazon test set to data/S1 .. data/S4 (test.csv in each).
split_amazon_data()

FileNotFoundError: [Errno 2] No such file or directory: 'data/final/Amazon.pkl'

In [349]:
def imdb_data():
    """
        Split the IMDB reviews 80/20 into train and dev sets for S1.

        Reads data/final/imdb.csv and keeps the review and sentiment columns.
        The sentiment values are written exactly as read; no 0/1 mapping is
        applied here. The two parts are saved without the index to
        data/S1/train.csv and data/S1/dev.csv.
    """
    reviews = pd.read_csv('data/final/imdb.csv')

    # Splitting the two-column frame picks the same rows, in the same order,
    # as splitting the columns separately and concatenating them again.
    train_part, dev_part = train_test_split(
        reviews[['review', 'sentiment']], test_size=0.2, random_state=1)

    # Save as csv file
    train_part.to_csv(r'data/S1/train.csv', index=False)
    dev_part.to_csv(r'data/S1/dev.csv', index=False)

In [350]:
# Write the IMDB train/dev split to data/S1.
imdb_data()

In [354]:
def yelp_data(test_size=0.2):
    """
        Split the Yelp reviews into train and dev sets and save them for S2.

        Star ratings of 4 or 5 are relabelled 1 and every other rating 0.

        Parameters:
        test_size = float, optional (default=0.2)
            Fraction of the reviews written to the dev set.

        Returns:
        None. Writes data/S2/train.csv and data/S2/dev.csv and prints the
        shape of each frame.
    """
    yelp_file = 'data/final/yelp.csv'
    # Keep only the review text and rating. The copy means the relabelling
    # below works on its own frame.
    yelp_df = pd.read_csv(yelp_file)[['text', 'stars']].copy()
    yelp_df['stars'] = yelp_df.stars.apply(lambda x: 1 if x == 4 or x == 5 else 0)
    X, y = yelp_df.text, yelp_df.stars

    # Split into train and dev set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=1)
    df = pd.concat([X_train, y_train], axis=1)
    df_dev = pd.concat([X_test, y_test], axis=1)

    # Save as csv file
    df.to_csv(r'data/S2/train.csv', index=False)
    df_dev.to_csv(r'data/S2/dev.csv', index=False)
    print(df.shape)
    print(df_dev.shape)


In [352]:
def combine_yelp_imbd():
    """
        Combine the Yelp and IMDB reviews and save a train/dev split for S3.

        Yelp ratings of 4 or 5 become 1 and every other rating 0. IMDB
        sentiment "positive" becomes 1 and anything else 0. The IMDB columns
        are renamed to match Yelp (text, stars) before the two are stacked.

        Returns:
        None. Writes data/S3/train.csv and data/S3/dev.csv (80/20 split,
        random_state=1) and prints the shape of each frame.
    """
    # Read Yelp data and keep only reviews and ratings. The copy means the
    # relabelling assigns into its own frame rather than a column slice.
    yelp_df = pd.read_csv('data/final/yelp.csv')[['text', 'stars']].copy()
    yelp_df['stars'] = yelp_df.stars.apply(lambda x: 1 if x == 4 or x == 5 else 0)

    # Read IMDB data, keep reviews and sentiment, and rename the columns to
    # match the Yelp data since the two datasets are combined below.
    imdb_df = pd.read_csv('data/final/imdb.csv')[['review', 'sentiment']]
    imdb_df = imdb_df.rename(columns={'review': 'text', 'sentiment': 'stars'})
    imdb_df['stars'] = imdb_df.stars.apply(lambda x: 1 if x == "positive" else 0)

    # Combine Yelp and IMDB reviews. ignore_index gives the stacked frame
    # unique row labels; row order, and hence the split, is unchanged.
    yelp_imdb = pd.concat([yelp_df, imdb_df], axis=0, ignore_index=True)

    # Split into train and dev set. Splitting the frame directly keeps text
    # and label together, so no column-wise re-join is needed.
    df, df_dev = train_test_split(
        yelp_imdb[['text', 'stars']], test_size=0.2, random_state=1)

    # Save train and dev set as csv files
    df.to_csv(r'data/S3/train.csv', index=False)
    df_dev.to_csv(r'data/S3/dev.csv', index=False)
    print(df.shape)
    print(df_dev.shape)

In [353]:
# Write the combined Yelp + IMDB train/dev split to data/S3 (prints both shapes).
combine_yelp_imbd()

(48000, 2)
(12000, 2)


In [401]:
def combine_yelp_imdb_amz():
    """
        Combine Yelp, IMDB and Amazon reviews and save a train/dev split for S4.

        Labels are binarised: Yelp and Amazon ratings of 4 or 5 become 1 and
        every other rating 0; IMDB sentiment "positive" becomes 1 and
        anything else 0. All three sources are renamed to the Yelp column
        names (text, stars) before being stacked.

        The Amazon part skips the first 10000 positive and first 10000
        negative reviews and takes the next 10000 of each. The skipped range
        is where split_amazon_data draws its test rows from.

        Returns:
        None. Writes data/S4/train.csv and data/S4/dev.csv (80/20 split,
        random_state=1).

        Raises:
        FileNotFoundError if any of the three input files is missing.
    """
    # Read Yelp data and keep only reviews and ratings. The copy means the
    # relabelling assigns into its own frame rather than a column slice.
    yelp_df = pd.read_csv('data/final/yelp.csv')[['text', 'stars']].copy()
    yelp_df['stars'] = yelp_df.stars.apply(lambda x: 1 if x == 4 or x == 5 else 0)

    # Read IMDB data, keep reviews and sentiment, and rename the columns to
    # match the Yelp data since the datasets are combined below.
    imdb_df = pd.read_csv('data/final/imdb.csv')[['review', 'sentiment']]
    imdb_df = imdb_df.rename(columns={'review': 'text', 'sentiment': 'stars'})
    imdb_df['stars'] = imdb_df.stars.apply(lambda x: 1 if x == "positive" else 0)

    # The context manager closes the gzip handle even if unpickling fails.
    with gzip.open("data/final/Amazon.pkl", 'rb') as f:
        reviews = pickle.load(f)
    amz_df = reviews[['reviewText', 'overall']].copy()
    amz_df['overall'] = amz_df.overall.apply(lambda x: 1 if x == 4 or x == 5 else 0)

    # Rows 10000..19999 of each class.
    amz_pos = amz_df.loc[amz_df['overall'] == 1][10000:20000]
    amz_neg = amz_df.loc[amz_df['overall'] == 0][10000:20000]
    amz_comb = shuffle(pd.concat([amz_pos, amz_neg], axis=0), random_state=12)
    amz_comb = amz_comb.rename(columns={'reviewText': 'text', 'overall': 'stars'})

    # ignore_index gives the stacked frame unique row labels; row order, and
    # hence the split, is unchanged.
    df_ayi = pd.concat([imdb_df, amz_comb, yelp_df], axis=0, sort=False,
                       ignore_index=True)

    # Splitting the frame directly keeps text and label together.
    df_train, df_dev = train_test_split(
        df_ayi[['text', 'stars']], test_size=0.2, random_state=1)
    df_train.to_csv(r'data/S4/train.csv', index=False)
    df_dev.to_csv(r'data/S4/dev.csv', index=False)

In [None]:
# Write the combined Yelp + IMDB + Amazon train/dev split to data/S4.
# The parentheses matter: the bare name only evaluates the function object
# and writes nothing.
combine_yelp_imdb_amz()