# Processing Data for Course

In [1]:
import pandas as pd
import gzip
import json

In [2]:
def parse_gzip_json(path):
  '''
  Helper for get_json_to_df: lazily parse a gzip-compressed JSON-lines file.

  Parameters
  ----------
  path : str or path-like
      Location of a gzip file containing one JSON document per line.

  Yields
  ------
  object
      The value decoded by json.loads for each non-blank line, in file order.

  Raises
  ------
  OSError
      If the file cannot be opened or is not valid gzip data.
  json.JSONDecodeError
      If a non-blank line is not valid JSON.
  '''
  # The context manager closes the handle once the generator is exhausted,
  # explicitly closed, or garbage collected (previously it was never closed).
  with gzip.open(path, 'rb') as g:
    for l in g:
      # Tolerate blank lines (e.g. a trailing newline) instead of raising.
      if not l.strip():
        continue
      yield json.loads(l)

def get_json_to_df(path):
  '''
  Load a gzip-compressed JSON-lines file into a DataFrame.

  Each record yielded by parse_gzip_json becomes one row; rows are labelled
  0, 1, 2, ... in the order they are read.
  '''
  # Key every record by its position so from_dict(orient='index') uses it as the row label.
  numbered_records = dict(enumerate(parse_gzip_json(path)))
  return pd.DataFrame.from_dict(numbered_records, orient='index')

## Jeopardy Questions

In [2]:
# Load the raw Jeopardy question dump, then print its schema and non-null
# counts to see which columns have missing entries.
df_jeop_ques = pd.read_csv('./data/04-textual-analysis/jeopardy/Jeopardy-Questions.csv')
df_jeop_ques.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216930 entries, 0 to 216929
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Show Number  216930 non-null  int64 
 1   AirDate      216930 non-null  object
 2   Round        216930 non-null  object
 3   Category     216930 non-null  object
 4   Value        213296 non-null  object
 5   Question     216930 non-null  object
 6   Answer       216927 non-null  object
dtypes: int64(1), object(6)
memory usage: 11.6+ MB


In [3]:
def convert_to_int(v):
    """Convert a Jeopardy clue value such as '$1,200' to a number.

    Parameters
    ----------
    v : str, int, float or None
        Raw entry from the 'Value' column. Missing entries arrive as NaN
        (a float) after pd.read_csv.

    Returns
    -------
    float
        The dollar amount, or 0.0 when there is no value (None, NaN, an empty
        string, or the literal text 'None').

    Raises
    ------
    ValueError
        If the text left after removing '$' and ',' is not a number.
    """
    # No wager / missing value. NaN is the only float that is not equal to itself.
    if v is None or (isinstance(v, float) and v != v):
        return 0.0
    # Already numeric: keep the amount instead of discarding it.
    if isinstance(v, (int, float)):
        return float(v)
    # Strip only an actual leading '$' (slicing v[1:] would eat a digit when
    # the symbol is absent), then drop thousands separators.
    cleaned = str(v).strip().lstrip('$').replace(',', '')
    # 'None' covers parsers that do not treat that text as a missing value.
    if cleaned in ('', 'None'):
        return 0.0
    return float(cleaned)

In [4]:
# Derive a numeric IntValue column from the raw Value column.
# Note: this adds the column to df_jeop_ques in place, so the .info() output
# shown for the load cell above no longer matches the frame after this runs.
df_jeop_ques['IntValue'] = df_jeop_ques['Value'].apply(convert_to_int)
df_jeop_ques.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216930 entries, 0 to 216929
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Show Number  216930 non-null  int64  
 1   AirDate      216930 non-null  object 
 2   Round        216930 non-null  object 
 3   Category     216930 non-null  object 
 4   Value        213296 non-null  object 
 5   Question     216930 non-null  object 
 6   Answer       216927 non-null  object 
 7   IntValue     216930 non-null  float64
dtypes: float64(1), int64(1), object(6)
memory usage: 13.2+ MB


In [5]:
# Order the questions from lowest to highest value (ascending is the default),
# so the rows with no value (IntValue == 0) come first.
df_jeop_ques_sorted = df_jeop_ques.sort_values('IntValue')
df_jeop_ques_sorted.head()

Unnamed: 0,Show Number,AirDate,Round,Category,Value,Question,Answer,IntValue
216929,4999,2006-05-11,Final Jeopardy!,HISTORIC NAMES,,A silent movie title includes the last name of...,Grigori Alexandrovich Potemkin,0.0
146550,5095,2006-11-03,Final Jeopardy!,THE OSCARS,,He's the only person over 70 years of age to w...,Clint Eastwood,0.0
146611,5364,2007-12-27,Final Jeopardy!,AUTHORS' OBITUARIES,,"In 1991 the N.Y. Times said English was ""too s...",Dr. Seuss,0.0
146671,4336,2003-06-09,Final Jeopardy!,TIME'S MAN OF THE YEAR,,The only man named Time's Man of the Year 2 st...,Richard M. Nixon,0.0
146730,370,1986-02-07,Final Jeopardy!,ROYALTY,,"Just as wife of U.S. President is called ""Firs...",Prince Consort,0.0


In [6]:
# Summary statistics of the numeric question value, used to choose tier cut-offs.
int_values = df_jeop_ques_sorted['IntValue']
print(
    'min', int_values.min(),
    '\nmode', int_values.mode(),
    '\nmean', int_values.mean(),
    '\nmedian', int_values.median(),
    '\nmax', int_values.max()
)

min 0.0 
mode 0    400.0
Name: IntValue, dtype: float64 
mean 739.9884755451067 
median 600.0 
max 18000.0


In [15]:
# Draw an equally sized random sample of questions from three value tiers.
# A fixed seed makes the samples (and the files written from them) identical
# on every Restart & Run All; without it each run exported different questions.
SAMPLE_SEED = 42
SAMPLE_SIZE = 10000

int_values = df_jeop_ques_sorted['IntValue']

# Tier boundaries are inclusive at the top: <= 400, (400, 1000], > 1000.
# NOTE(review): rows with no value (IntValue == 0) land in the lowest tier - confirm that is intended.
q_less_than_400 = df_jeop_ques_sorted.loc[int_values <= 400].sample(SAMPLE_SIZE, random_state=SAMPLE_SEED)
q_greater_than_400_less_than_1000 = df_jeop_ques_sorted.loc[(int_values > 400) & (int_values <= 1000)].sample(SAMPLE_SIZE, random_state=SAMPLE_SEED)
q_greater_than_1000 = df_jeop_ques_sorted.loc[int_values > 1000].sample(SAMPLE_SIZE, random_state=SAMPLE_SEED)

print(
    'q_t1', len(q_less_than_400),
    '\nq_t2', len(q_greater_than_400_less_than_1000),
    '\nq_t3', len(q_greater_than_1000)
)

q_t1 10000 
q_t2 10000 
q_t3 10000


In [16]:
# Write the question text of each sampled tier to its own file, one question
# per row, with no index column and no header row.
for _tier_frame, _tier_path in (
    (q_less_than_400, './data/04-textual-analysis/jeopardy/split/q_less_than_400.txt'),
    (q_greater_than_400_less_than_1000, './data/04-textual-analysis/jeopardy/split/q_greater_than_400_less_than_1000.txt'),
    (q_greater_than_1000, './data/04-textual-analysis/jeopardy/split/q_greater_than_1000.txt'),
):
    _tier_frame['Question'].to_csv(_tier_path, index=False, header=False)

## Amazon Reviews

Process Amazon reviews for text analysis. [Original source](https://nijianmo.github.io/amazon/index.html#complete-data):
> Justifying recommendations using distantly-labeled reviews and fine-grained aspects. Jianmo Ni, Jiacheng Li, Julian McAuley. Empirical Methods in Natural Language Processing (EMNLP), 2019.

In [7]:
# Video Games reviews. The raw data has since been deleted because of the
# identifiers it contains, so this cell cannot be re-run without the file.
# Reads the gzip JSON-lines dump into a DataFrame, then lists its columns.
df_amazon_video_games = get_json_to_df('./data/04-textual-analysis/amazon/Video_Games.json.gz')
df_amazon_video_games.columns

Index(['overall', 'verified', 'reviewTime', 'reviewerID', 'asin',
       'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'vote',
       'style', 'image'],
      dtype='object')

In [8]:
# Show row count, column dtypes and memory footprint of the raw review frame.
df_amazon_video_games.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2565349 entries, 0 to 2565348
Data columns (total 12 columns):
 #   Column          Dtype  
---  ------          -----  
 0   overall         float64
 1   verified        bool   
 2   reviewTime      object 
 3   reviewerID      object 
 4   asin            object 
 5   reviewerName    object 
 6   reviewText      object 
 7   summary         object 
 8   unixReviewTime  int64  
 9   vote            object 
 10  style           object 
 11  image           object 
dtypes: bool(1), float64(1), int64(1), object(9)
memory usage: 237.3+ MB


In [29]:
# Keep only reviews that have both a star rating and review text.
df_amazon_video_games_scores = df_amazon_video_games.dropna(subset=['overall','reviewText'])
# Before dropping, the frame had index labels 0 to 2565348 (2565349 rows);
# compare with the entry count reported below.
df_amazon_video_games_scores.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2563634 entries, 0 to 2565348
Data columns (total 12 columns):
 #   Column          Dtype  
---  ------          -----  
 0   overall         float64
 1   verified        bool   
 2   reviewTime      object 
 3   reviewerID      object 
 4   asin            object 
 5   reviewerName    object 
 6   reviewText      object 
 7   summary         object 
 8   unixReviewTime  int64  
 9   vote            object 
 10  style           object 
 11  image           object 
dtypes: bool(1), float64(1), int64(1), object(9)
memory usage: 237.2+ MB


In [17]:
# Split the cleaned reviews into one frame per star rating (1.0 through 5.0).
overall_score = df_amazon_video_games_scores['overall']
vg_1, vg_2, vg_3, vg_4, vg_5 = (
    df_amazon_video_games_scores.loc[overall_score == float(stars)]
    for stars in range(1, 6)
)

In [30]:
# Number of reviews per star rating, printed on one line as "<stars> <count> ...".
rating_frames = {'1': vg_1, '2': vg_2, '3': vg_3, '4': vg_4, '5': vg_5}
print(' '.join(f'{label} {len(frame)}' for label, frame in rating_frames.items()))

1 311891 2 141333 3 212346 4 412413 5 1487366


In [31]:
# Keep only the review text column for each rating bucket.
vg_1_txt = vg_1['reviewText']
vg_2_txt = vg_2['reviewText']
vg_3_txt = vg_3['reviewText']
vg_4_txt = vg_4['reviewText']
vg_5_txt = vg_5['reviewText']

In [37]:
# Spot-check the first few 3-star review texts.
vg_3_txt.head()

1     The game itself worked great but the story lin...
6     Would like it more if they had at least update...
21    Just like I remember from childhood. But. You ...
31    We were hoping for something more like the ori...
33                                                   ok
Name: reviewText, dtype: object

In [44]:
def _drop_blank_reviews(texts):
    """Return `texts` without missing values and without empty-string entries.

    Series.notnull() is an alias of Series.notna(), so a single null check is
    enough; the result is the same as testing both.
    Whitespace-only strings are kept, as before.
    """
    return texts.loc[texts.notna() & (texts != '')]


# Apply the same filter to every rating bucket.
vg_1_txt_1 = _drop_blank_reviews(vg_1_txt)
vg_2_txt_1 = _drop_blank_reviews(vg_2_txt)
vg_3_txt_1 = _drop_blank_reviews(vg_3_txt)
vg_4_txt_1 = _drop_blank_reviews(vg_4_txt)
vg_5_txt_1 = _drop_blank_reviews(vg_5_txt)

In [45]:
# Number of non-empty reviews left per star rating, as "<stars> <count> ...".
filtered_texts = {'1': vg_1_txt_1, '2': vg_2_txt_1, '3': vg_3_txt_1, '4': vg_4_txt_1, '5': vg_5_txt_1}
print(' '.join(f'{label} {len(texts)}' for label, texts in filtered_texts.items()))

1 311808 2 141306 3 212302 4 412278 5 1485940


In [46]:
# Write the filtered review texts of each rating to its own file, one review
# per row, with no index column and no header row.
for _review_texts, _review_path in (
    (vg_1_txt_1, './data/04-textual-analysis/amazon-reviews/video_game_reviews_1_ratings.txt'),
    (vg_2_txt_1, './data/04-textual-analysis/amazon-reviews/video_game_reviews_2_ratings.txt'),
    (vg_3_txt_1, './data/04-textual-analysis/amazon-reviews/video_game_reviews_3_ratings.txt'),
    (vg_4_txt_1, './data/04-textual-analysis/amazon-reviews/video_game_reviews_4_ratings.txt'),
    (vg_5_txt_1, './data/04-textual-analysis/amazon-reviews/video_game_reviews_5_ratings.txt'),
):
    _review_texts.to_csv(_review_path, index=False, header=False)
