In [1]:
import numpy as np
import pandas as pd
import warnings 
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from PIL import Image
import PIL.ImageOps
from wordcloud import ImageColorGenerator
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from nltk.stem import WordNetLemmatizer
from sklearn.utils import shuffle
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
import sys
import pickle
warnings.filterwarnings('ignore')

In [2]:
def uciData(data_dir="/home/yogesh/fall19/ml660/project/sentiment_labelled_sentences"):

    """
    Reads UCI dataset .txt files (Amazon, IMDB, YELP).
    Reference for uci_data collection from https://github.com/hoomanm/Sentiment-Analysis

    Each non-blank line is stripped and split on tab characters; the first
    field is kept as the review text and the second as the sentiment label.
    Labels are kept exactly as read (strings), no numeric conversion is done here.

    :param data_dir: Directory holding amazon_cells_labelled.txt, imdb_labelled.txt
                     and yelp_labelled.txt. Defaults to the original project location.
    :returns : DataFrame with columns 'reviews' and 'sentiment', rows in the order
               Amazon, IMDB, Yelp.
    :raises IndexError: if a non-blank line contains no tab-separated label.
    """

    import os  # local import so this cell does not depend on an edit to the imports cell

    file_names = ("amazon_cells_labelled.txt", "imdb_labelled.txt", "yelp_labelled.txt")

    uci_train_data = []
    uci_train_labels = []

    for file_name in file_names:
        with open(os.path.join(data_dir, file_name), 'r') as f:
            for raw_line in f:
                review = raw_line.strip()
                if not review:
                    # A blank (e.g. trailing) line carries no review; skipping it
                    # avoids an IndexError on the label lookup below.
                    continue
                fields = review.split("\t")
                uci_train_data.append(fields[0])
                uci_train_labels.append(fields[1])

    return pd.DataFrame({'reviews': uci_train_data, 'sentiment': uci_train_labels})

In [3]:
def imdbData(data_dir='/home/yogesh/fall19/ml660/project/movie_data'):


    """
    Reads IMDB dataset .txt files (full_train.txt and full_test.txt), one review per line.
    Modified dataset downloaded from https://github.com/aaronkub/machine-learning-examples/blob/master/imdb-sentiment-analysis/movie_data.tar.gz

    :param data_dir: Directory holding full_train.txt and full_test.txt.
                     Defaults to the original project location.
    :returns df_imdb_train: DataFrame with a single 'reviews' column built from full_train.txt
    :returns df_imdb_test: DataFrame with a single 'reviews' column built from full_test.txt
    """

    import os  # local import so this cell does not depend on an edit to the imports cell

    def read_reviews(file_name):
        # Every line is kept (even an empty one): this function attaches no labels,
        # so dropping lines here would change row positions for any caller that
        # labels reviews by row number.
        with open(os.path.join(data_dir, file_name), 'r') as f:
            return [line.strip() for line in f]

    df_imdb_train = pd.DataFrame(read_reviews('full_train.txt'), columns=['reviews'])
    # Built from the test file; previously this frame was a copy of the training reviews.
    df_imdb_test = pd.DataFrame(read_reviews('full_test.txt'), columns=['reviews'])

    return df_imdb_train, df_imdb_test

In [4]:
# Load the raw frames once. imdbData() reads both IMDB files on every call,
# so unpack a single call instead of calling it once per returned frame.
df_uci = uciData()
df_imdb_train, df_imdb_test = imdbData()

In [6]:
def datasetSelection(df_uci, df_imdb_train, df_imdb_test, target):

    """
    Build train/test features and labels for one of six dataset configurations.

    Configurations selected by ``target``:
        1. TRAIN AND TEST ON OVERALL DATASET INCLUDING IMDB AND UCI DATASET
        2. TRAIN AND TEST ON UCI DATASET
        3. TRAIN ON IMDB DATASET AND TEST ON UCI DATASET
        4. TRAIN AND TEST ON IMDB DATASET
        5. TRAIN ON 100% IMDB + 80% UCI DATASET AND TEST ON 20% UCI DATASET
        6. TRAIN ON 100% UCI + 80% IMDB DATASET AND TEST ON 20% IMDB DATASET

    IMDB labels are generated by row position: each IMDB frame is labelled as
    25000 rows, 1 for rows 0-12499 and 0 for the rest.
    NOTE(review): this assumes each IMDB file lists 12500 positive reviews
    followed by 12500 negative ones - confirm against the data files.

    :param df_uci: DataFrame of the UCI dataset with columns 'reviews' and 'sentiment'.
    :param df_imdb_train: DataFrame of the IMDB train reviews (column 'reviews').
    :param df_imdb_test: DataFrame of the IMDB test reviews (column 'reviews').
    :param target: Integer 1-6 selecting the configuration listed above.
    :returns X_train: Train features as returned by tfidfVectorization.
    :returns X_test: Test features as returned by tfidfVectorization.
    :returns y_train: DataFrame with a 'sentiment' column of train labels.
    :returns y_test: DataFrame with a 'sentiment' column of test labels.
    :raises ValueError: if ``target`` is not one of 1-6.
    """

    def imdb_labels():
        # Positional labels for one IMDB frame (see the NOTE in the docstring).
        return pd.DataFrame([1 if i < 12500 else 0 for i in range(25000)], columns=['sentiment'])

    def as_float(labels):
        # Return a new frame rather than writing into a split/shuffled result.
        return labels.assign(sentiment=labels.sentiment.astype(float))

    # NOTE(review): configurations 3-6 vectorize train and test with separate
    # tfidfVectorization calls. That function is defined elsewhere; confirm it
    # applies one shared fitted vocabulary, otherwise the feature columns of
    # X_train and X_test will not line up.

    if target == 1:
        print("TRAIN AND TEST ON OVERALL DATASET INCLUDING IMDB AND UCI DATASET")
        df_train_data = pd.concat([pd.DataFrame(df_uci['reviews']), df_imdb_train, df_imdb_test], axis=0)
        y = pd.concat([pd.DataFrame(df_uci['sentiment']), imdb_labels(), imdb_labels()])
        X = tfidfVectorization(df_train_data, 1)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
        y_train, y_test = as_float(y_train), as_float(y_test)

    elif target == 2:
        print("TRAIN AND TEST ON UCI DATASET")
        df_train_data = pd.DataFrame(df_uci['reviews'])
        y = pd.DataFrame(df_uci['sentiment'])
        X = tfidfVectorization(df_train_data, 1)
        # Labels are passed through as-is here; unlike the other
        # configurations, no float cast is applied.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    elif target == 3:
        print("TRAIN ON IMDB DATASET AND TEST ON UCI DATASET")
        # Reviews and labels are shuffled separately but with the same
        # random_state and length, so the rows stay paired.
        df_train_data = shuffle(pd.concat([df_imdb_train, df_imdb_test]), random_state = 7)
        df_test_data = shuffle(pd.DataFrame(df_uci['reviews']), random_state = 7)
        y_train = shuffle(pd.concat([imdb_labels(), imdb_labels()]), random_state = 7)
        y_test = shuffle(pd.DataFrame(df_uci['sentiment'], columns=['sentiment']), random_state = 7)
        y_train, y_test = as_float(y_train), as_float(y_test)
        X_train, X_test = tfidfVectorization(df_train_data, 2), tfidfVectorization(df_test_data, 2)

    elif target == 4:
        print("TRAIN AND TEST ON IMDB DATASET")
        y_train = shuffle(imdb_labels(), random_state = 7)
        y_test = shuffle(imdb_labels(), random_state = 7)
        imdb_train_shuffled = shuffle(df_imdb_train, random_state = 7)
        imdb_test_shuffled = shuffle(df_imdb_test, random_state = 7)
        X_train, X_test = tfidfVectorization(imdb_train_shuffled, 1), tfidfVectorization(imdb_test_shuffled, 1)

    elif target == 5:
        print("TRAIN ON 100% IMDB + 80% UCI DATASET AND TEST ON 20% UCI DATASET")
        df_train_data, df_test_data, y_train, y_test = splitTrain(df_uci, df_imdb_test, df_imdb_train, 1)
        y_train, y_test = as_float(y_train), as_float(y_test)
        X_train, X_test = tfidfVectorization(df_train_data, 3), tfidfVectorization(df_test_data, 3)

    elif target == 6:
        print("TRAIN ON 100% UCI + 80% IMDB DATASET AND TEST ON 20% IMDB DATASET")
        df_train_data, df_test_data, y_train, y_test = splitTrain(df_uci, df_imdb_test, df_imdb_train, 2)
        y_train, y_test = as_float(y_train), as_float(y_test)
        X_train, X_test = tfidfVectorization(df_train_data, 4), tfidfVectorization(df_test_data, 4)

    else:
        # Previously this branch printed "done" and then failed with an
        # UnboundLocalError on the shape prints below.
        raise ValueError(f"target must be an integer from 1 to 6, got {target!r}")

    print(X_train.shape)
    print(X_test.shape)
    print(y_test.shape)
    print(y_train.shape)
    return X_train, X_test, y_train, y_test

In [7]:
def splitTrain(df_uci, df_imdb_test, df_imdb_train, target):


    """
    Split the raw review frames for configurations 5 and 6.

    target == 1 (configuration 5): train on the first 2401 UCI rows plus both
    IMDB frames, test on the remaining UCI rows; all four outputs are shuffled
    with random_state 7.
    target == 2 (configuration 6): train on all UCI rows, the IMDB train frame
    and the first 20000 IMDB test rows; test on IMDB test rows from 20000 on.
    Nothing is shuffled in this configuration.

    IMDB labels are generated by row position: 1 for rows 0-12499 of a frame,
    0 for rows 12500 and above.

    :param df_uci: DataFrame of the UCI dataset; column 0 is 'reviews', column 1 is 'sentiment'.
    :param df_imdb_test: DataFrame of the IMDB test reviews (column 'reviews').
    :param df_imdb_train: DataFrame of the IMDB train reviews (column 'reviews').
    :param target: Integer, either 1 or 2 (see above).
    :returns df_train_data: DataFrame of raw train reviews (not yet vectorized).
    :returns df_test_data: DataFrame of raw test reviews (not yet vectorized).
    :returns y_train: DataFrame with a 'sentiment' column of train labels.
    :returns y_test: DataFrame with a 'sentiment' column of test labels.
    :raises ValueError: if ``target`` is neither 1 nor 2.
    """

    def imdb_labels(start, stop):
        # Positional labels for IMDB rows start..stop-1.
        return pd.DataFrame([1 if i < 12500 else 0 for i in range(start, stop)], columns=['sentiment'])

    if target == 1:
        # NOTE(review): 2401 looks like an 80% cut of a 3000-row UCI frame that
        # is off by one (80% would be 2400) - confirm before changing.
        df_train_data = shuffle(pd.concat([pd.DataFrame(df_uci.iloc[0:2401, 0]), df_imdb_train, df_imdb_test], axis=0), random_state = 7)
        df_test_data = shuffle(pd.DataFrame(df_uci.iloc[2401:, 0]), random_state = 7)
        y_train = shuffle(pd.concat([pd.DataFrame(df_uci.iloc[0:2401, 1]), imdb_labels(0, 25000), imdb_labels(0, 25000)]), random_state = 7)
        y_test = shuffle(pd.DataFrame(df_uci.iloc[2401:, 1]), random_state = 7)

    elif target == 2:
        df_train_data = pd.concat([pd.DataFrame(df_uci['reviews']), df_imdb_train, df_imdb_test.iloc[0:20000]], axis=0)
        df_test_data = df_imdb_test.iloc[20000:]
        y_train = pd.concat([pd.DataFrame(df_uci['sentiment']), imdb_labels(0, 25000), imdb_labels(0, 20000)])
        # Test rows are IMDB test positions 20000-24999, which the positional
        # rule labels 0. They were previously all labelled 1, inverting every
        # test label.
        # NOTE(review): this test set therefore contains a single class;
        # consider shuffling before the 80/20 cut.
        y_test = imdb_labels(20000, 25000)

    else:
        # Previously an unsupported target fell through to an
        # UnboundLocalError on the return statement.
        raise ValueError(f"target must be 1 or 2, got {target!r}")

    return df_train_data, df_test_data, y_train, y_test
