The original dataset folder from https://ai.stanford.edu/~amaas/data/sentiment/ (found in the reviewdb folder) contained 2 folders: 
1. test
2. train
    Each folder had a positive review text file and a negative review text file with urls of all the comments (which contains the imdb movie id). 
    Each folder also had two folders:
    1. pos 
    2. neg
    Each of these folders had text files of the review text.
    Each of the files name was of the format "row number in url file"_"rating from 0-5".txt (eg: 200_4.txt)
    

Processing the files to get a file with the text review, 0-5 rating and movieid to which the comment belongs to

In [14]:
import pandas as pd
from collections import defaultdict
from pathlib import Path

test_pos_urls = pd.read_csv('reviewdb/test/urls_pos.txt', header = None)
test_neg_urls = pd.read_csv('reviewdb/test/urls_neg.txt', header = None)
train_pos_urls = pd.read_csv('reviewdb/train/urls_pos.txt', header = None)
train_neg_urls = pd.read_csv('reviewdb/train/urls_neg.txt', header = None)

Replacing the url with the movieid

In [15]:
for i in range(len(test_pos_urls)): #since they all have 12500 rows can do all simultaneously
    test_pos_urls.iloc[i,0] = str(test_pos_urls.iloc[i,0]).split('/')[4] #split by / and only take the movieid
    test_neg_urls.iloc[i,0] = str(test_neg_urls.iloc[i,0]).split('/')[4] #split by / and only take the movieid
    train_pos_urls.iloc[i,0] = str(train_pos_urls.iloc[i,0]).split('/')[4] #split by / and only take the movieid
    train_neg_urls.iloc[i,0] = str(train_neg_urls.iloc[i,0]).split('/')[4] #split by / and only take the movieid
test_pos_urls.head()

Unnamed: 0,0
0,tt0406816
1,tt0406816
2,tt0406816
3,tt0406816
4,tt0406816


# Processing the test files

Processing the positive test files

In [16]:
#Create dataframe for test_pos files with review, rating, and IMDb movieid
my_dir_path = "reviewdb/test/pos"
len_path = len(my_dir_path)
results = defaultdict(list)
#Taking each file to create a dictionary with reviews and then convert to a dataframe
for file in Path(my_dir_path).iterdir():
    with open(file, "r") as file_open:
        fname = str(file)[len_path+1:]
        results["file_name"] = str(file)[len_path+1:]
        results["text"].append(file_open.read())
df = pd.DataFrame(results)
#Taking each file name and using it to get the row, rating and fetching the movieid from the test_pos_urls file
i = 0
df['rating'] = df.apply(lambda _: '', axis=1)
df['imdb_id'] = df.apply(lambda _: '', axis=1)
for file in Path(my_dir_path).iterdir():
    fname = str(file)[len_path+1:-4].split('_') #.../4715_9.txt  take the string without the filepath and after .txt and spilt at _
    row = fname[0]                              #row in url file with the correnponding movie id
    rating = fname[1]                           #rating for this movie and review
    movieid = test_pos_urls.iloc[int(row),0]
    df.iloc[i,0] = movieid                      #substituting in the string imdb title
    df.iloc[i,2] = rating                       #substituting the rating from 0-5
    df.iloc[i,3] = int(movieid[2:])            #substituting in the integer imdb title without the prefix 'tt'
    i += 1
df.columns = ['imdb_title_id','review','rating', 'imdb_id']
df.head()

Unnamed: 0,imdb_title_id,review,rating,imdb_id
0,tt0112495,"Based on an actual story, John Boorman shows t...",9,112495
1,tt0460766,This is a gem. As a Film Four production - the...,9,460766
2,tt0238784,"I really like this show. It has drama, romance...",9,238784
3,tt0339384,This is the best 3-D experience Disney has at ...,10,339384
4,tt0762073,"Of the Korean movies I've seen, only three had...",10,762073


Processing the negative test files

In [17]:
#Create dataframe for test_neg files with review, rating, and IMDb movieid
my_dir_path = "reviewdb/test/neg"
len_path = len(my_dir_path)
results = defaultdict(list)
#Taking each file to create a dictionary with reviews and then convert to a dataframe
for file in Path(my_dir_path).iterdir():
    with open(file, "r") as file_open:
        fname = str(file)[len_path+1:]
        results["file_name"] = str(file)[len_path+1:]
        results["text"].append(file_open.read())
df2 = pd.DataFrame(results)
#Taking each file name and using it to get the row, rating and fetching the movieid from the test_neg_urls file
i = 0
df2['rating'] = df2.apply(lambda _: '', axis=1)
df2['imdb_id'] = df2.apply(lambda _: '', axis=1)
for file in Path(my_dir_path).iterdir():
    fname = str(file)[len_path+1:-4].split('_') #.../4715_9.txt  take the string without the filepath and after .txt and spilt at _
    row = fname[0]                              #row in url file with the correnponding movie id
    rating = fname[1]                           #rating for this movie and review
    movieid = test_neg_urls.iloc[int(row),0]
    df2.iloc[i,0] = movieid                      #substituting in the string imdb title
    df2.iloc[i,2] = rating                       #substituting the rating from 0-5
    df2.iloc[i,3] = int(movieid[2:])            #substituting in the integer imdb title without the prefix 'tt'
    i += 1
df2.columns = ['imdb_title_id','review','rating', 'imdb_id']
df2.head()

Unnamed: 0,imdb_title_id,review,rating,imdb_id
0,tt0138541,Alan Rickman & Emma Thompson give good perform...,4,138541
1,tt0202521,I have seen this movie and I did not care for ...,1,202521
2,tt0417658,"In Los Angeles, the alcoholic and lazy Hank Ch...",4,417658
3,tt0066105,"This film is bundled along with ""Gli fumavano ...",2,66105
4,tt0787505,I only comment on really very good films and o...,1,787505


# Processing the train files

Processing the positive train files

In [18]:
#Create dataframe for train_pos files with review, rating, and IMDb movieid
my_dir_path = "reviewdb/train/pos"
len_path = len(my_dir_path)
results = defaultdict(list)
#Taking each file to create a dictionary with reviews and then convert to a dataframe
for file in Path(my_dir_path).iterdir():
    with open(file, "r") as file_open:
        fname = str(file)[len_path+1:]
        results["file_name"] = str(file)[len_path+1:]
        results["text"].append(file_open.read())
df3 = pd.DataFrame(results)
#Taking each file name and using it to get the row, rating and fetching the movieid from the train_pos_urls file
i = 0
df3['rating'] = df3.apply(lambda _: '', axis=1)
df3['movieid_int'] = df3.apply(lambda _: '', axis=1)
for file in Path(my_dir_path).iterdir():
    fname = str(file)[len_path+1:-4].split('_') #.../4715_9.txt  take the string without the filepath and after .txt and spilt at _
    row = fname[0]                              #row in url file with the correnponding movie id
    rating = fname[1]                           #rating for this movie and review
    movieid = train_pos_urls.iloc[int(row),0]
    df3.iloc[i,0] = movieid                      #substituting in the string imdb title
    df3.iloc[i,2] = rating                       #substituting the rating from 0-5
    df3.iloc[i,3] = int(movieid[2:])            #substituting in the integer imdb title without the prefix 'tt'
    i += 1
df3.columns = ['imdb_title_id','review','rating', 'imdb_id']
df3.head()

Unnamed: 0,imdb_title_id,review,rating,imdb_id
0,tt0087507,For a movie that gets no respect there sure ar...,9,87507
1,tt0076683,Bizarre horror movie filled with famous faces ...,8,76683
2,tt0110099,"A solid, if unremarkable film. Matthau, as Ein...",7,110099
3,tt0462346,It's a strange feeling to sit alone in a theat...,8,462346
4,tt0801427,"You probably all already know this by now, but...",10,801427


Processing the negative train files

In [19]:
#Create dataframe for train_neg files with review, rating, and IMDb movieid
my_dir_path = "reviewdb/train/neg"
len_path = len(my_dir_path)
results = defaultdict(list)
#Taking each file to create a dictionary with reviews and then convert to a dataframe
for file in Path(my_dir_path).iterdir():
    with open(file, "r") as file_open:
        fname = str(file)[len_path+1:]
        results["file_name"] = str(file)[len_path+1:]
        results["text"].append(file_open.read())
df4 = pd.DataFrame(results)
#Taking each file name and using it to get the row, rating and fetching the movieid from the train_neg_urls file
i = 0
df4['rating'] = df4.apply(lambda _: '', axis=1)
df4['movieid_int'] = df4.apply(lambda _: '', axis=1)
for file in Path(my_dir_path).iterdir():
    fname = str(file)[len_path+1:-4].split('_') #.../4715_9.txt  take the string without the filepath and after .txt and spilt at _
    row = fname[0]                              #row in url file with the correnponding movie id
    rating = fname[1]                           #rating for this movie and review
    movieid = train_neg_urls.iloc[int(row),0]
    df4.iloc[i,0] = movieid                      #substituting in the string imdb title
    df4.iloc[i,2] = rating                       #substituting the rating from 0-5
    df4.iloc[i,3] = int(movieid[2:])            #substituting in the integer imdb title without the prefix 'tt'
    i += 1
df4.columns = ['imdb_title_id','review','rating', 'imdb_id']
df4.head()

Unnamed: 0,imdb_title_id,review,rating,imdb_id
0,tt0114057,Working with one of the best Shakespeare sourc...,4,114057
1,tt0334541,"Well...tremors I, the original started off in ...",1,334541
2,tt0337640,Ouch! This one was a bit painful to sit throug...,4,337640
3,tt0219400,"I've seen some crappy movies in my life, but t...",1,219400
4,tt0806203,"""Carriers"" follows the exploits of two guys an...",3,806203


# Concatenate all

In [21]:
#concatenate df3 and df4 
final_df = pd.concat([df,df2,df3,df4])
#rename movieid to imdb_title_id
final_df.columns = ['imdb_title_id','review','rating', 'imdb_id'] 
#MAKE RATING BETWEEN 0-5 
for i in range(len(final_df)):
    final_df.iloc[i,2] = int(final_df.iloc[i,2])/2 
final_df.head()

Unnamed: 0,imdb_title_id,review,rating,imdb_id
0,tt0112495,"Based on an actual story, John Boorman shows t...",4.5,112495
1,tt0460766,This is a gem. As a Film Four production - the...,4.5,460766
2,tt0238784,"I really like this show. It has drama, romance...",4.5,238784
3,tt0339384,This is the best 3-D experience Disney has at ...,5.0,339384
4,tt0762073,"Of the Korean movies I've seen, only three had...",5.0,762073


In [22]:
final_df.to_csv('finalDB/reviews.csv',index=False)