In [1]:
import os
import time
import tarfile
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

class PL04DataLoader_Part_1:
    ''' Base class for PL04 loaders: subclasses provide get_documents(),
        this class pairs the returned documents with their labels.
    '''

    def __init__(self):
        pass

    def get_labelled_dataset(self, fold = 0):
        ''' Compile one fold of the data set as a list of
            (document, label) pairs: all 'pos' documents first,
            followed by all 'neg' documents.
        '''
        return [
            (document, label)
            for label in ('pos', 'neg')
            for document in self.get_documents(fold = fold, label = label)
        ]

    def get_documents(self, fold = 0, label = 'pos'):
        ''' Enumerate the raw contents of all data set files of one
            fold and label. Must be overridden by subclasses.
            Args:
                fold: which fold to load (0 to n_folds-1)
                label: 'pos' or 'neg' to
                    select data with positive or negative sentiment
                    polarity
            Return:
                List of tokenised documents, each a list of sentences
                that in turn are lists of tokens
            Raises:
                NotImplementedError: always, in this base class
        '''
        raise NotImplementedError

class PL04DataLoader(PL04DataLoader_Part_1):
    ''' Adds cross-validation splitting on top of get_labelled_dataset(). '''

    def get_xval_splits(self, n_folds = 10):
        ''' Split data with labels for cross-validation.
            Args:
                n_folds: number of folds k; folds 0 to n_folds-1 are
                    loaded via get_labelled_dataset() (default 10)
            Return:
                list of k pairs (training_data, test_data); in pair i
                the test data is fold i and the training data is the
                concatenation of all other folds, starting with the
                fold after i and wrapping around
        '''
        # load the folds
        folds = [
            self.get_labelled_dataset(fold = i)
            for i in range(n_folds)
        ]
        # create training-test splits
        retval = []
        for i in range(n_folds):
            training_data = []
            # offsets 1..n_folds-1 visit every fold except fold i itself
            for offset in range(1, n_folds):
                training_data.extend(folds[(i + offset) % n_folds])
            retval.append((training_data, folds[i]))
        return retval
    
class PL04DataLoaderFromStream(PL04DataLoader):
    ''' Loads documents from a gzip-compressed tar stream into memory,
        grouped by (fold, label).
    '''

    def __init__(self, tgz_stream, max_documents = 2000, **kwargs):
        ''' Read the archive once and populate self.data.
            Args:
                tgz_stream: binary file object positioned at the start
                    of a gzip-compressed tar archive
                max_documents: stop reading after this many documents
                    have been loaded; None reads the whole archive
                **kwargs: forwarded to the parent constructor
        '''
        super().__init__(**kwargs)
        # maps (fold, label) to a list of documents
        self.data = {}
        counter = 0
        with tarfile.open(
            mode = 'r|gz',
            fileobj = tgz_stream
        ) as tar_archive:
            for tar_member in tar_archive:
                if max_documents is not None and counter >= max_documents:
                    break
                path_components = tar_member.name.split('/')
                filename = path_components[-1]
                if not (
                    filename.startswith('cv')
                    and filename.endswith('.txt')
                    and '_' in filename
                ):
                    continue
                f = tar_archive.extractfile(tar_member)
                if f is None:
                    # extractfile() returns None for members that are
                    # not regular files or links
                    continue
                # the label is the name of the containing folder
                label = path_components[-2]
                # the fold is the single digit right after the 'cv' prefix
                fold = int(filename[2])
                with f:
                    # one list of whitespace-separated tokens per line
                    document = [
                        line.decode('utf-8').split()
                        for line in f
                    ]
                self.data.setdefault((fold, label), []).append(document)
                counter += 1

    def get_documents(self, fold = 0, label = 'pos'):
        ''' Return the list of documents stored for (fold, label).
            Raises KeyError if nothing was loaded for that combination.
        '''
        return self.data[(fold, label)]

class PL04DataLoaderFromTGZ(PL04DataLoaderFromStream):
    ''' Convenience loader that takes a path instead of an open stream. '''
    
    def __init__(self, data_path, **kwargs):
        ''' Open data_path in binary mode and pass the stream, together
            with any keyword arguments, to the parent constructor. The
            file is closed once the parent constructor has returned.
        '''
        with open(data_path, 'rb') as tgz_stream:
            super().__init__(tgz_stream, **kwargs)

In [2]:
# Sorted names of the entries in the current working directory.
dir_entries = sorted(os.listdir())

In [3]:
# Load the data set from the archive 'data.tar.gz' (path relative to the
# current working directory).
data_loader = PL04DataLoaderFromTGZ('data.tar.gz')

In [4]:
# test "get_documents()"

def get_document_preview(document, max_length = 72):
    ''' Join the leading tokens of a document with '|' so that the
        result is at most max_length characters long.
        Args:
            document: list of sentences, each a list of token strings
            max_length: maximum length of the returned string
        Return:
            '|'-separated tokens; collection stops at the first token
            that would exceed the limit
    '''
    picked = []
    used_chars = 0
    all_tokens = (token for sentence in document for token in sentence)
    for token in all_tokens:
        # len(picked) is the number of separators needed if token is added
        if used_chars + len(token) + len(picked) > max_length:
            break
        picked.append(token)
        used_chars += len(token)
    return '|'.join(picked)
    
# Print index, sentence count and a short preview for the first five
# documents of each label (default fold of get_documents()).
for label in 'pos neg'.split():
    print(f'== {label} ==')
    print('doc sentences start of first sentence')
    documents = data_loader.get_documents(label = label)
    for index, document in enumerate(documents):
        preview = get_document_preview(document)
        print('%3d %7d   %s' %(index, len(document), preview))
        if index == 4:
            break

== pos ==
doc sentences start of first sentence
  0      25   films|adapted|from|comic|books|have|had|plenty|of|success|,|whether
  1      39   every|now|and|then|a|movie|comes|along|from|a|suspect|studio|,|with
  2      19   you've|got|mail|works|alot|better|than|it|deserves|to|.|in|order|to|make
  3      42   "|jaws|"|is|a|rare|film|that|grabs|your|attention|before|it|shows|you|a
  4      25   moviemaking|is|a|lot|like|being|the|general|manager|of|an|nfl|team|in
== neg ==
doc sentences start of first sentence
  0      35   plot|:|two|teen|couples|go|to|a|church|party|,|drink|and|then|drive|.
  1      13   the|happy|bastard's|quick|movie|review|damn|that|y2k|bug|.|it's|got|a
  2      23   it|is|movies|like|these|that|make|a|jaded|movie|viewer|thankful|for|the
  3      19   "|quest|for|camelot|"|is|warner|bros|.|'|first|feature-length|,
  4      37   synopsis|:|a|mentally|unstable|man|undergoing|psychotherapy|saves|a|boy


In [5]:
# test "get_xval_splits()"

# List of (training_data, test_data) pairs, one per cross-validation run.
splits = data_loader.get_xval_splits()

# Report how many documents each run uses for training and for testing.
print('tr-size te-size (number of documents)')
for xval_tr_data, xval_te_data in splits:
    print('%7d %7d' %(len(xval_tr_data), len(xval_te_data)))

tr-size te-size (number of documents)
   1800     200
   1800     200
   1800     200
   1800     200
   1800     200
   1800     200
   1800     200
   1800     200
   1800     200
   1800     200


In [6]:
class PolarityPredictorInterface:
    ''' Interface for polarity predictors; both methods must be
        overridden by subclasses.
    '''

    def train(self, data_with_labels):
        ''' Train the predictor on labelled data.
            Raises NotImplementedError in this base class.
        '''
        raise NotImplementedError
        
    def predict(self, data):
        ''' Predict labels for the given data.
            Raises NotImplementedError in this base class.
        '''
        raise NotImplementedError

In [7]:
class PolarityPredictorWithVocabulary(PolarityPredictorInterface):
    ''' Training skeleton that first builds a token vocabulary from the
        training data and then delegates feature extraction, target
        creation and model fitting to subclasses.
    '''

    def train(self, data_with_labels):
        ''' Build the vocabulary from data_with_labels (an iterable of
            (document, label) pairs), extract features and targets, and
            fit the model on them.
        '''
        self.reset_vocab()
        self.add_to_vocab_from_data(data_with_labels)
        self.finalise_vocab()
        tr_features = self.extract_features(
            data_with_labels
        )
        tr_targets = self.get_targets(data_with_labels)
        self.train_model_on_features(tr_features, tr_targets)

    def reset_vocab(self):
        ''' Start with an empty vocabulary. '''
        self.vocab = set()

    def add_to_vocab_from_data(self, data):
        ''' Add every token of every sentence of every document in
            data to the vocabulary; labels are ignored.
        '''
        for document, _ in data:
            for sentence in document:
                self.vocab.update(sentence)

    def finalise_vocab(self):
        ''' Freeze the vocabulary into a list and build the reverse
            token -> index map.
        '''
        # Sort the tokens so that the index assigned to each token (and
        # hence the feature column order) is the same in every run;
        # iteration order of a set of strings can change between
        # interpreter sessions.
        self.vocab = sorted(self.vocab)
        # create reverse map for fast token lookup
        self.token2index = {
            token: index for index, token in enumerate(self.vocab)
        }

    def extract_features(self, data):
        ''' Turn data into model input features (subclass hook). '''
        raise NotImplementedError

    def get_targets(self, data, label2index = None):
        ''' Turn the labels in data into model targets (subclass hook). '''
        raise NotImplementedError

    def train_model_on_features(self, tr_features, tr_targets):
        ''' Fit the underlying model (subclass hook). '''
        raise NotImplementedError

In [8]:
import numpy

class PolarityPredictorWithBagOfWords_01(PolarityPredictorWithVocabulary):
    ''' Bag-of-words feature extraction over the trained vocabulary. '''

    def __init__(self, clip_counts = True):
        ''' Args:
                clip_counts: if True, features are 0/1 presence flags;
                    otherwise they are token occurrence counts
        '''
        self.clip_counts = clip_counts

    def extract_features(self, data):
        ''' Build an int32 matrix with one row per item of data and one
            column per vocabulary entry. Tokens that are not in the
            vocabulary are ignored.
        '''
        n_rows = len(data)
        n_columns = len(self.vocab)
        features = numpy.zeros((n_rows, n_columns), dtype=numpy.int32)
        for row, (document, _) in enumerate(data):
            for sentence in document:
                for token in sentence:
                    index = self.token2index.get(token)
                    if index is None:
                        # token not in vocab --> continue with next token
                        continue
                    if self.clip_counts:
                        features[row, index] = 1
                    else:
                        features[row, index] += 1
        return features

In [9]:
class PolarityPredictorWithBagOfWords(PolarityPredictorWithBagOfWords_01):
    ''' Adds target encoding to the bag-of-words feature extractor. '''

    def get_targets(self, data):
        ''' Create a one-dimensional int8 array with one entry per item
            of data: 1 where the label is 'pos', 0 otherwise.
        '''
        targets = numpy.zeros(len(data), dtype=numpy.int8)
        for position, (_, label) in enumerate(data):
            if label == 'pos':
                targets[position] = 1
        return targets

    def train_model_on_features(self, tr_features, tr_targets):
        ''' Fit the underlying model (subclass hook). '''
        raise NotImplementedError

## Naive Bayes

The next few cells set up the baseline Naive Bayes model that we were supplied with.

In [134]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

class PolarityPredictorBowNB(PolarityPredictorWithBagOfWords):
    ''' Bag-of-words polarity predictor backed by sklearn's MultinomialNB. '''

    def train_model_on_features(self, tr_features, tr_targets):
        ''' Fit a fresh MultinomialNB on the feature matrix and targets. '''
        self.model = MultinomialNB()
        self.model.fit(tr_features, tr_targets)

    def predict(
        self, data, get_accuracy = False,
        get_confusion_matrix = False
    ):
        ''' Predict 'pos'/'neg' labels for data.
            Return:
                (labels, y_pred) when no metric is requested; otherwise
                (retval, y_true, y_pred), where retval is a list that
                starts with the labels, followed by the accuracy and/or
                the confusion matrix in that order.
        '''
        features = self.extract_features(data)
        y_pred = self.model.predict(features)
        # map the numeric predictions back to string labels
        labels = ['pos' if is_positive else 'neg' for is_positive in y_pred]
        if not (get_accuracy or get_confusion_matrix):
            return labels, y_pred
        y_true = self.get_targets(data)
        retval = [labels]
        if get_accuracy:
            retval.append(metrics.accuracy_score(y_true, y_pred))
        if get_confusion_matrix:
            retval.append(metrics.confusion_matrix(y_true, y_pred))
        return retval, y_true, y_pred

In [135]:
# first functionality test

# Train the Naive Bayes predictor on the training part of the first split.
model = PolarityPredictorBowNB()
model.train(splits[0][0]) 

In [136]:
# Predict on the test part of the first split; with get_accuracy = True,
# `predictions` is the list [labels, accuracy].
predictions, y_true, pred = model.predict(splits[0][1], get_accuracy = True)

In [137]:
# Element-wise comparison of the gold targets with the predictions.
corrects = y_true==pred

In [138]:
# Positions in the test data where the Naive Bayes prediction was wrong.
incorrects_nb = [
    position
    for position, answer in enumerate(corrects)
    if not answer
]

In [139]:
# Display the positions misclassified by the Naive Bayes model.
incorrects_nb

[0,
 4,
 19,
 22,
 25,
 28,
 34,
 40,
 44,
 49,
 50,
 54,
 55,
 57,
 58,
 72,
 75,
 80,
 82,
 89,
 90,
 93,
 98,
 103,
 108,
 110,
 124,
 125,
 134,
 135,
 141,
 142,
 154,
 159,
 160,
 175,
 176,
 186,
 190,
 194,
 197]

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

class PolarityPredictorBowLR(PolarityPredictorWithBagOfWords):
    ''' Bag-of-words polarity predictor backed by sklearn's
        LogisticRegression.
    '''

    def train_model_on_features(self, tr_features, tr_targets):
        ''' Fit a fresh LogisticRegression on the features and targets. '''
        # iterations set to 1000 as default of 100 didn't guarantee convergence with our data
        self.model = LogisticRegression(max_iter=1000)
        self.model.fit(tr_features, tr_targets)

    def predict(
        self, data, get_accuracy = False,
        get_confusion_matrix = False
    ):
        ''' Predict 'pos'/'neg' labels for data.
            Return:
                the list of labels when no metric is requested;
                otherwise a list that starts with the labels, followed
                by the accuracy and/or the confusion matrix in that
                order.
        '''
        features = self.extract_features(data)
        y_pred = self.model.predict(features)
        # map the numeric predictions back to string labels
        labels = ['pos' if is_positive else 'neg' for is_positive in y_pred]
        if not (get_accuracy or get_confusion_matrix):
            return labels
        y_true = self.get_targets(data)
        retval = [labels]
        if get_accuracy:
            retval.append(metrics.accuracy_score(y_true, y_pred))
        if get_confusion_matrix:
            retval.append(metrics.confusion_matrix(y_true, y_pred))
        return retval

In [140]:
# Rebind `model` to an (untrained) logistic regression predictor.
model = PolarityPredictorBowLR()

In [87]:
# Paths of the prediction files, numbered 1-10, for each model directory.
bert_256_files = [f'256_BERT/{i}_pred.txt' for i in range(1, 11)]
bert_512_files = [f'512_BERT/{i}_pred_512.txt' for i in range(1, 11)]
bert_512_12_files = [f'512_12_BERT/{i}_pred_512_12.txt' for i in range(1, 11)]
bert_256_neg_files = ['256_BERT_NEG/1_pred_negation.txt']

c_names = ['gold','pred','correct','text']

# One empty frame per prediction file; they are filled in place later.
dataframes_256 = [pd.DataFrame(columns=c_names) for _ in range(10)]
dataframes_512 = [pd.DataFrame(columns=c_names) for _ in range(10)]
dataframes_512_12 = [pd.DataFrame(columns=c_names) for _ in range(10)]
dataframes_256_neg = [pd.DataFrame(columns=c_names)]

# Individual names for the same frame objects held in the lists above.
(df1, df2, df3, df4, df5,
 df6, df7, df8, df9, df10) = dataframes_256

(df1_512, df2_512, df3_512, df4_512, df5_512,
 df6_512, df7_512, df8_512, df9_512, df10_512) = dataframes_512

(df1_512_12, df2_512_12, df3_512_12, df4_512_12, df5_512_12,
 df6_512_12, df7_512_12, df8_512_12, df9_512_12, df10_512_12) = dataframes_512_12

(df1_neg,) = dataframes_256_neg


def create_dfs(files, df_list):
    ''' Fill each dataframe in df_list, in place, from the file at the
        same position in files.

        The first line of every file is skipped. Each remaining
        non-blank line is split on whitespace: token 0 is ignored,
        tokens 1-3 become the first three column values, and all
        further tokens are re-joined (each followed by one space) as
        the fourth column value. Rows are labelled 0, 1, 2, ...

        Args:
            files: list of file paths, at least as long as df_list
            df_list: list of dataframes with four columns
        Return:
            df_list itself; its frames have been modified in place, so
            other references to the same frame objects see the data too
        Raises:
            IndexError: if files is shorter than df_list or a non-blank
                line has fewer than four tokens
    '''
    for j, dataframe in enumerate(df_list):
        with open(files[j], 'r') as f:
            lines = f.readlines()

        count = 0
        for line in lines[1:]:
            tokens = line.split()
            if not tokens:
                # blank line: nothing to store
                continue
            text = ''.join(token + ' ' for token in tokens[4:])
            dataframe.loc[count] = [tokens[1], tokens[2], tokens[3], text]
            count += 1
    return(df_list)

In [88]:
# Fill the frames of each configuration from its prediction files.
# create_dfs() modifies the frames in place and returns the same list.
dataframes_256 = create_dfs(bert_256_files, dataframes_256)

dataframes_512 = create_dfs(bert_512_files, dataframes_512)

dataframes_512_12 = create_dfs(bert_512_12_files, dataframes_512_12)

dataframes_256_neg = create_dfs(bert_256_neg_files, dataframes_256_neg)

In [89]:
def get_f1(dataframe):
    ''' Compute accuracy, precision, recall and F1 for one frame of
        predictions, treating 'pos' as the positive class.

        Columns are read by position: 0 = gold label, 1 = predicted
        label, 2 = 'yes' if the prediction is correct.

        Args:
            dataframe: frame with at least three columns as described
        Return:
            tuple (accuracy, precision, recall, f1_score, errors) where
            errors lists the row positions whose third column is not
            'yes'. A ratio whose denominator is zero (empty frame, no
            predicted positives, no gold positives, or precision and
            recall both zero) is reported as 0.0.
    '''
    def _ratio(numerator, denominator):
        # undefined ratios are reported as 0.0 instead of raising
        return numerator / denominator if denominator else 0.0

    true_pos = 0
    false_pos = 0
    false_neg = 0
    corrects = 0
    errors = []
    for i in range(len(dataframe)):
        gold = dataframe.iat[i, 0]
        pred = dataframe.iat[i, 1]
        if dataframe.iat[i, 2] == 'yes':
            corrects += 1
        else:
            errors.append(i)
        if gold == 'pos' and pred == 'pos':
            true_pos += 1
        elif gold == 'pos' and pred == 'neg':
            false_neg += 1
        elif gold == 'neg' and pred == 'pos':
            false_pos += 1

    accuracy = _ratio(corrects, len(dataframe))
    precision = _ratio(true_pos, true_pos + false_pos)
    recall = _ratio(true_pos, true_pos + false_neg)
    f1_score = 2 * _ratio(precision * recall, precision + recall)
    return(accuracy, precision, recall, f1_score, errors)

In [90]:
def get_averages(df_list):
    ''' Average the per-frame metrics of get_f1() over several frames.

        Args:
            df_list: non-empty list of prediction frames accepted by
                get_f1()
        Return:
            tuple (mean accuracy, mean precision, mean recall, mean F1,
            errors_list) where errors_list holds, for each frame in
            order, the list of error rows reported by get_f1()
        Raises:
            ZeroDivisionError: if df_list is empty
    '''
    accuracies = []
    precs = []
    recs = []
    f1s = []
    errors_list = []
    for dataframe in df_list:
        # reuse the single-frame implementation instead of repeating it
        accuracy, precision, recall, f1_score, errors = get_f1(dataframe)
        accuracies.append(accuracy)
        precs.append(precision)
        recs.append(recall)
        f1s.append(f1_score)
        errors_list.append(errors)

    n_frames = len(df_list)
    return(
        sum(accuracies)/n_frames,
        sum(precs)/n_frames,
        sum(recs)/n_frames,
        sum(f1s)/n_frames,
        errors_list,
    )

In [131]:
def print_averages_get_errors(dataframes, errorlist = False):
    ''' Either return the per-frame error rows or print a metric report.

        Args:
            dataframes: list of prediction frames
            errorlist: if True, return the list of per-frame error rows
                from get_averages() and print nothing
        Return:
            the per-frame error lists when errorlist is True; otherwise
            None, after printing the metrics of every frame followed by
            the averages over all frames
    '''
    mean_acc, mean_prec, mean_rec, mean_f1, errors = get_averages(dataframes)
    if errorlist == True:
        return(errors)

    for run, dataframe in enumerate(dataframes, start = 1):
        accuracy, precision, recall, f1_score, wrong_rows = get_f1(dataframe)
        print(f'Cross validation {run}')
        print(f'The accuracy is {accuracy*100:.2f}%')
        print(f'The precision is {precision*100:.2f}%')
        print(f'The recall is {recall*100:.2f}%')
        print(f'The F1 score is {f1_score*100:.2f}%')
        print(f'The model got the following rows wrong {wrong_rows}\n')

    print(f'The average accuracy is {mean_acc*100:.2f}%')
    print(f'The average precision is {mean_prec*100:.2f}%')
    print(f'The average recall is {mean_rec*100:.2f}%')
    print(f'The average F1 score is {mean_f1*100:.2f}%')

In [132]:
# Per-run lists of misclassified rows for each configuration. All three
# are computed because the comparison cells below read error_256 and
# error_512 as well as error_512_12.
error_256 = print_averages_get_errors(dataframes_256, True)
error_512 = print_averages_get_errors(dataframes_512, True)
error_512_12 = print_averages_get_errors(dataframes_512_12, True)

In [133]:
# First entry of error_512: the error rows of the first 512 frame.
error_512[0]

[1,
 10,
 24,
 25,
 40,
 44,
 50,
 56,
 82,
 83,
 91,
 106,
 118,
 134,
 135,
 136,
 157,
 159,
 171,
 173]

In [123]:
# First entry of error_256: the error rows of the first 256 frame.
error_256[0]

[1,
 10,
 44,
 50,
 82,
 91,
 108,
 113,
 117,
 118,
 124,
 128,
 129,
 134,
 135,
 157,
 171,
 177,
 197,
 198]

In [125]:
# Rows that are errors in both the first 256 frame and the first 512 frame.
for row in error_256[0]:
    if row in error_512[0]:
        print(row)

1
10
44
50
82
91
118
134
135
157
171


In [128]:
# Rows that are errors in the first 512_12 frame but not in the first 256 frame.
for row in error_512_12[0]:
    if row not in error_256[0]:
        print(row)

4
24
25


In [130]:
# Print per-run and average metrics for the 256 frames.
print_averages_get_errors(dataframes_256)

Cross validation 1
The accuracy is 90.00%
The precision is 87.04%
The recall is 94.00%
The F1 score is 90.38%
The model got the following rows wrong [1, 10, 44, 50, 82, 91, 108, 113, 117, 118, 124, 128, 129, 134, 135, 157, 171, 177, 197, 198]

Cross validation 2
The accuracy is 87.50%
The precision is 85.71%
The recall is 90.00%
The F1 score is 87.80%
The model got the following rows wrong [9, 14, 18, 50, 59, 62, 65, 78, 93, 94, 104, 105, 115, 118, 125, 137, 140, 142, 143, 146, 162, 167, 170, 177, 189]

Cross validation 3
The accuracy is 95.00%
The precision is 93.27%
The recall is 97.00%
The F1 score is 95.10%
The model got the following rows wrong [8, 44, 99, 100, 133, 142, 156, 162, 178, 196]

Cross validation 4
The accuracy is 91.00%
The precision is 90.20%
The recall is 92.00%
The F1 score is 91.09%
The model got the following rows wrong [14, 31, 36, 54, 83, 92, 94, 98, 105, 108, 109, 120, 153, 159, 161, 168, 181, 185]

Cross validation 5
The accuracy is 85.00%
The precision is 88

In [104]:
# Print per-run and average metrics for the 512_12 frames.
print_averages_get_errors(dataframes_512_12)

Cross validation 1
The accuracy is 93.50%
The precision is 95.79%
The recall is 91.00%
The F1 score is 93.33%
The model got the following rows wrong [1, 4, 10, 24, 25, 44, 50, 82, 91, 118, 135, 157, 171]

Cross validation 2
The accuracy is 92.50%
The precision is 92.08%
The recall is 93.00%
The F1 score is 92.54%
The model got the following rows wrong [9, 18, 59, 62, 88, 93, 94, 104, 115, 125, 137, 142, 167, 177, 189]

Cross validation 3
The accuracy is 93.00%
The precision is 90.57%
The recall is 96.00%
The F1 score is 93.20%
The model got the following rows wrong [8, 30, 50, 99, 100, 107, 111, 121, 133, 137, 142, 156, 162, 178]

Cross validation 4
The accuracy is 92.50%
The precision is 92.93%
The recall is 92.00%
The F1 score is 92.46%
The model got the following rows wrong [7, 14, 21, 34, 70, 83, 94, 98, 105, 109, 120, 128, 131, 159, 181]

Cross validation 5
The accuracy is 88.50%
The precision is 89.69%
The recall is 87.00%
The F1 score is 88.32%
The model got the following rows w

In [105]:
# Print per-run and average metrics for the 512 frames.
print_averages_get_errors(dataframes_512)

Cross validation 1
The accuracy is 90.00%
The precision is 90.82%
The recall is 89.00%
The F1 score is 89.90%
The model got the following rows wrong [1, 10, 24, 25, 40, 44, 50, 56, 82, 83, 91, 106, 118, 134, 135, 136, 157, 159, 171, 173]

Cross validation 2
The accuracy is 93.50%
The precision is 91.43%
The recall is 96.00%
The F1 score is 93.66%
The model got the following rows wrong [9, 59, 88, 94, 104, 115, 125, 137, 142, 146, 167, 177, 189]

Cross validation 3
The accuracy is 92.00%
The precision is 89.62%
The recall is 95.00%
The F1 score is 92.23%
The model got the following rows wrong [8, 30, 50, 62, 99, 100, 107, 121, 128, 133, 142, 147, 156, 162, 173, 178]

Cross validation 4
The accuracy is 87.50%
The precision is 87.88%
The recall is 87.00%
The F1 score is 87.44%
The model got the following rows wrong [7, 14, 21, 27, 28, 30, 34, 39, 70, 83, 85, 94, 98, 108, 109, 120, 121, 128, 131, 133, 159, 163, 179, 181, 191]

Cross validation 5
The accuracy is 87.50%
The precision is 88.6

In [106]:
# Error rows of the first 256 frame, computed explicitly: `scores` only
# exists inside print_averages_get_errors() and is not defined here.
incorrects_bert = get_f1(dataframes_256[0])[4]

In [109]:
# Number of errors: BERT list first, Naive Bayes list second.
len(incorrects_bert), len(incorrects_nb)

(20, 41)

In [111]:
# Positions that appear in both the Naive Bayes and the BERT error lists.
for position in incorrects_nb:
    if position in incorrects_bert:
        print(position)

44
50
82
108
124
134
135
197


In [119]:
# 'text' value of row 44 in df1 (44 is in the overlap printed above).
df1.iloc[44]['text']

'. jock\'s airplane at the beginning of the film has the registration number " ob-cpo " . this is a reference to obi-wan and c-3po from george lucas\' star wars ( 9 . 5/10 ) . also , the hieroglyphics in the well of souls include engravings of r2-d2 and c-3po . they can be seen on a post to the right of indy and sallah as they remove the ark . the script originally included a long fight between a swordsman and indiana with his whip . as legend has it , actor harrison ford was suffering diarrhea at the time , and asked if the scene could be shortened . spielberg said the only way he could shorten it was if indy pulled out his gun and just shot the guy . the entire crew laughed and that\'s how it was filmed . when indy first falls in the well of souls and is face to face with the cobra , you can see the snake\'s reflection on the glass dividing it and harrison ford , also some fingerprints and stuff like that . when indy is dragging along the ground , hanging onto the nazi soldier\'s tru

In [123]:
# (document, label) pair at position 44 of the first split's test data.
splits[0][1][44]

([['what',
   'do',
   'you',
   'get',
   'when',
   'you',
   'slap',
   'together',
   'a',
   'movie',
   'based',
   'on',
   'a',
   'story',
   'by',
   'the',
   'legendary',
   'george',
   'lucas',
   ',',
   'directed',
   'by',
   'virtuoso',
   'director',
   'steven',
   'spielberg',
   ',',
   'and',
   'starring',
   'one',
   'of',
   'the',
   'biggest',
   'box-office',
   'stars',
   'in',
   'the',
   'world',
   ',',
   'harrison',
   'ford',
   '?'],
  ['you',
   'get',
   'one',
   "hot-fudge-rockin'",
   'good',
   'time',
   ',',
   "that's",
   'what',
   'you',
   'get',
   '!',
   '!',
   '!'],
  ['plot',
   ':',
   'professor/archeologist',
   'indiana',
   'jones',
   'sets',
   'out',
   'to',
   'find',
   'the',
   'long-lost',
   'mystical',
   'ark',
   'of',
   'the',
   'covenant',
   'before',
   'the',
   'nazis',
   'get',
   'their',
   'grubby',
   'fingers',
   'hands',
   'on',
   'it',
   '.'],
  ['adventures',
   ',',
   'snakes',
   ',',


In [126]:
# Row 50 of df1: gold label, prediction, correctness flag and text.
df1.iloc[50]

gold                                                     pos
pred                                                     neg
correct                                                   no
text       , and boon is the comedian . he's got a steady...
Name: 50, dtype: object

In [127]:
# Full 'text' value of row 50 in df1.
df1.iloc[50]['text']

', and boon is the comedian . he\'s got a steady-date , katy ( karen allen ) , but she\'s sick of playing second-fiddle to a bottle of j . d . then there are the others : pinto , a wimp ; flounder , a blimp ; d . day , a biker ; stork , who may or may not have brain-damage ; and last but not least . . . bluto ! bluto , played by the late , great john belushi , is the man . he\'s the kind of guy who slugs back entire fifths of whiskey then proclaim , " i needed that . " the kind of guy who puts a cream-filled snowball into his mouth , puffs up his cheeks and spits it out , and then says " i\'m a zit -- get it ? " the story is as follows : the omegas are getting the deltas kicked off campus . the deltas , knowing that fighting the omegas is stupid , decide to go out with style , throwing a wild toga party and ruining the homecoming parade . this is the fucnniest movie int he history of the world . do yourself a favor and go see it . '

In [125]:
# (document, label) pair at position 50 of the first split's test data.
splits[0][1][50]

([['national',
   "lampoon's",
   'animal',
   'house',
   ',',
   'made',
   'in',
   '1978',
   'and',
   'set',
   'in',
   '1962',
   ',',
   'remains',
   'one',
   'of',
   'the',
   '--',
   'no',
   ',',
   'fuck',
   'that',
   'noise',
   '--',
   '*',
   'the',
   '*',
   'funniest',
   'movie',
   'ever',
   'made',
   '.'],
  ['and',
   'this',
   "isn't",
   'just',
   'my',
   'opinion',
   ',',
   'either',
   ';',
   'everybody',
   'knows',
   'this',
   ',',
   'and',
   "that's",
   'why',
   'about',
   'a',
   'gazillion',
   'inferior',
   'rip-offs',
   'have',
   'been',
   'made',
   ',',
   'trying',
   'to',
   'duplicate',
   'its',
   'success',
   '.'],
  ['(', 'pcu', 'anyone', '?'],
  ['and',
   'the',
   'first',
   'person',
   'to',
   'bring',
   'up',
   'glory',
   'daze',
   'gets',
   'decked',
   '.',
   ')'],
  ['animal',
   'house',
   'takes',
   'place',
   'at',
   'the',
   'fictional',
   'faber',
   'college',
   ',',
   'circa',
   '196

In [128]:
# Row 82 of df1: gold label, prediction, correctness flag and text.
df1.iloc[82]

gold                                                     pos
pred                                                     neg
correct                                                   no
text       less music , and john landis seems to have los...
Name: 82, dtype: object

In [129]:
# Full 'text' value of row 82 in df1.
df1.iloc[82]['text']

'less music , and john landis seems to have lost interest in the whole thing . there\'s a few early crashes , and then one huge pile-up , but after that it all stops . it\'s just the music . one of my problems with the first is that cab calloway\'s song is so good the actually blues brothers look dull after him , but there\'s no problems with this . the music is all as good as ever , tons of great musicians showing up -- with the exception of johnny lang , who can\'t sing , all the musicians do a great job . the only real problems i had was the special effects . these were superfluous and a waste of money . since the film isn\'t doing very well , they could mean we have no possibility of another sequel , which i want to see . the bluegrass version of riders in the sky is even better than rawhide . -- http : //www . geocities . com/hollywood/academy/8034/ remove no spam to reply . " drive carefully but recklessly " , mama , child\'s toy " the only excercise i take is walking behind the 