In [1]:
# Importing the necessary modules
import pandas as pd
import tarfile
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Load the training split. The file is read without a header row, so pandas
# labels the columns with integers (0 and 1).
train_df = pd.read_csv(
    'yelp_review_full_csv/train.csv',
    header=None,
)


In [3]:
# Preview the first rows of the training data as loaded from the CSV.
train_df.head()

Unnamed: 0,0,1
0,5,dr. goldberg offers everything i look for in a...
1,2,"Unfortunately, the frustration of being Dr. Go..."
2,4,Been going to Dr. Goldberg for over 10 years. ...
3,4,Got a letter in the mail last week that said D...
4,1,I don't know what Dr. Goldberg was like before...


In [4]:
# Load the test split the same way as the training split: no header row,
# so the columns are labelled 0 and 1.
test_df = pd.read_csv(
    'yelp_review_full_csv/test.csv',
    header=None,
)


In [5]:
# Turn column 0 into a binary flag: 1 where the original value equals 2, else 0.
# NOTE: this overwrites column 0 in place. Re-running the cell without
# reloading the CSVs sets every label to 0, because the column then only
# contains 0/1.
train_df[0] = train_df[0].eq(2).astype(int)
test_df[0] = test_df[0].eq(2).astype(int)

In [6]:
# Preview the training data again to check the overwritten column 0.
train_df.head()


Unnamed: 0,0,1
0,0,dr. goldberg offers everything i look for in a...
1,1,"Unfortunately, the frustration of being Dr. Go..."
2,0,Been going to Dr. Goldberg for over 10 years. ...
3,0,Got a letter in the mail last week that said D...
4,0,I don't know what Dr. Goldberg was like before...


Making things BERT friendly
First let's make the data compliant with BERT:

Column 0: An ID for the row. (Required both for train and test data.)
Column 1: The class label for the row. (Required only for train data.)
Column 2: A column of the same letter for all rows — this is a throw-away column that we need to include because BERT expects it. (Required only for train data.)
Column 3: The text examples we want to classify. (Required both for train and test data.)

We need to split the data into the files BERT expects: BERT’s data loading classes expect two files, called train and dev, for training. They can also use a test file, but they expect the test file to be unlabelled.


Once the data is in the correct format, we need to save the files as .tsv (BERT doesn't take .csv as input.)

In [7]:
# Creating training dataframe according to BERT by adding the required columns:
# a row id, the class label, a constant throw-away letter, and the review text.
#
# Line breaks and tabs inside a review are replaced with a space so that each
# example stays on one row of the tab-separated output. The pattern covers:
#   * the two-character escape backslash + "n" (presumably how the Yelp CSV
#     stores line breaks -- confirm against the dataset README), and
#   * real carriage-return, line-feed and tab characters.
# The earlier pattern r'\n' only matched a real line feed.
df_bert = pd.DataFrame({
    'id': range(len(train_df)),
    'label': train_df[0],
    'alpha': ['a'] * len(train_df),
    'text': train_df[1].replace(r'\\n|[\r\n\t]', ' ', regex=True),
})

In [8]:
# Splitting training data file into *train* and *dev*: 1% of the rows are held
# out as the dev set. random_state pins the shuffle so that a fresh
# "Restart & Run All" produces the same split (and the same files) every time.
df_bert_train, df_bert_dev = train_test_split(
    df_bert,
    test_size=0.01,
    random_state=42,
)

df_bert_train.head()

Unnamed: 0,id,label,alpha,text
641263,641263,0,a,The waitress was nice. It was very quiet as th...
72400,72400,1,a,Friendly nurses and clean hospital but doctors...
594049,594049,0,a,I've been on a hiatus from reviews on Yelp for...
173094,173094,0,a,"Congratulations Thomas Keller. My meal, actua..."
539454,539454,1,a,1st time there. I didn't appreciate being rush...


In [9]:
# Creating test dataframe according to BERT: only a row id and the review text.
#
# Line breaks and tabs inside a review are replaced with a space so that each
# example stays on one row of the tab-separated output. The pattern covers:
#   * the two-character escape backslash + "n" (presumably how the Yelp CSV
#     stores line breaks -- confirm against the dataset README), and
#   * real carriage-return, line-feed and tab characters.
# The earlier pattern r'\n' only matched a real line feed.
df_bert_test = pd.DataFrame({
    'id': range(len(test_df)),
    'text': test_df[1].replace(r'\\n|[\r\n\t]', ' ', regex=True),
})

df_bert_test.head()

Unnamed: 0,id,text
0,0,I got 'new' tires from them and within two wee...
1,1,Don't waste your time. We had two different p...
2,2,All I can say is the worst! We were the only 2...
3,3,I have been to this restaurant twice and was d...
4,4,Food was NOT GOOD at all! My husband & I ate h...


In [11]:
# Saving dataframes to .tsv format as required by BERT.
# Every split is written tab-separated, without the index and without a
# header row.
# NOTE(review): some BERT data processors skip the first line of test.tsv as a
# header -- confirm the consumer expects a header-less test file.
for tsv_name, frame in (
    ('train.tsv', df_bert_train),
    ('dev.tsv', df_bert_dev),
    ('test.tsv', df_bert_test),
):
    frame.to_csv(tsv_name, sep='\t', index=False, header=False)