In [None]:
# English 286 Final Project
# Author: Jason Ngo
# Last Modified: December 8th
# Purpose: To identify the longevity of rank 1 novels on the NYT Bestseller List across nine 10-year segments

import pandas as pd
from statistics import mean

# Load the NYT dataset (tab-separated file)
nyt_df = pd.read_csv("nyt_full.tsv", sep = "\t")

# Build a dataframe with one row per title: the earliest week it held rank 1.
# The week column is parsed to datetime (so weeks can be subtracted later) and the
# rows are sorted chronologically BEFORE de-duplicating, so keep = "first" always
# means "earliest week" regardless of how the rows are ordered in the file.
# .assign() produces a new frame rather than writing a column into a filtered slice
# of nyt_df, and the stable sort keeps rows that share a week in file order.
# The final reset_index gives a clean 0..n-1 index.
nyt_r1 = (
    nyt_df.loc[nyt_df["rank"] == 1]
    .assign(week = lambda frame: pd.to_datetime(frame["week"]))
    .sort_values(by = "week", kind = "stable")
    .drop_duplicates(subset = ["title_id"], keep = "first")
    .reset_index(drop = True)
)

## Split up the cleaned DF into nine 10-year segments
# segment 1 = 1931 - 1940
# segment 2 = 1941 - 1950
# segment 3 = 1951 - 1960
# segment 4 = 1961 - 1970
# segment 5 = 1971 - 1980
# segment 6 = 1981 - 1990
# segment 7 = 1991 - 2000
# segment 8 = 2001 - 2010
# segment 9 = 2011 - 2020

# One empty list per segment; each later collects that segment's week-to-week gaps in days.
# The comprehension builds nine distinct list objects (not nine references to one list).
(
    seg1_diff, seg2_diff, seg3_diff,
    seg4_diff, seg5_diff, seg6_diff,
    seg7_diff, seg8_diff, seg9_diff,
) = [[] for _ in range(9)]

# Split up the rank 1 dataframe by the year segments, sorting it every time to reset the index values
def _rows_in_years(frame, start_year, stop_year):
    """Return the rows of `frame` whose "year" falls in [start_year, stop_year).

    Passing start_year = None leaves the range open at the bottom, so only the
    upper bound is applied. The result is sorted by "week" and re-indexed from 0,
    so positions 0..n-1 can be used directly as labels.
    """
    in_range = frame["year"] < stop_year
    if start_year is not None:
        in_range = (frame["year"] >= start_year) & in_range
    return frame.loc[in_range].sort_values(by = "week", ignore_index = True)

# The first segment has no lower bound, so any rows dated before 1931 would land in it too.
nyt_r1_seg1 = _rows_in_years(nyt_r1, None, 1941)
nyt_r1_seg2 = _rows_in_years(nyt_r1, 1941, 1951)
nyt_r1_seg3 = _rows_in_years(nyt_r1, 1951, 1961)
nyt_r1_seg4 = _rows_in_years(nyt_r1, 1961, 1971)
nyt_r1_seg5 = _rows_in_years(nyt_r1, 1971, 1981)
nyt_r1_seg6 = _rows_in_years(nyt_r1, 1981, 1991)
nyt_r1_seg7 = _rows_in_years(nyt_r1, 1991, 2001)
nyt_r1_seg8 = _rows_in_years(nyt_r1, 2001, 2011)
nyt_r1_seg9 = _rows_in_years(nyt_r1, 2011, 2021)

# Calculating week[i] - week[i - 1] per year segment, and appending the results into their respective list
def _days_between_weeks(segment):
    """Return the gaps, in whole days, between consecutive "week" values of `segment`.

    `segment` must already be sorted by "week". The first row has no predecessor,
    so its (undefined) gap is dropped; a segment with fewer than two rows yields [].
    .tolist() converts the values to plain Python ints, so statistics.mean prints
    the same values as it would for hand-built int lists.
    """
    return segment["week"].diff().iloc[1:].dt.days.tolist()

# Fill each segment's list in order, printing the number of gaps and then their mean.
# Note: mean() raises StatisticsError if a segment produced no gaps at all.
for gaps, segment in (
    (seg1_diff, nyt_r1_seg1),
    (seg2_diff, nyt_r1_seg2),
    (seg3_diff, nyt_r1_seg3),
    (seg4_diff, nyt_r1_seg4),
    (seg5_diff, nyt_r1_seg5),
    (seg6_diff, nyt_r1_seg6),
    (seg7_diff, nyt_r1_seg7),
    (seg8_diff, nyt_r1_seg8),
    (seg9_diff, nyt_r1_seg9),
):
    gaps.extend(_days_between_weeks(segment))
    print(len(gaps))
    print(mean(gaps))

# Gather the per-segment results into one summary table.
# Keys are the period labels, values are the lists of day gaps; dict order is the row order.
_gaps_by_period = {
    "1931 - 1940": seg1_diff,
    "1941 - 1950": seg2_diff,
    "1951 - 1960": seg3_diff,
    "1961 - 1970": seg4_diff,
    "1971 - 1980": seg5_diff,
    "1981 - 1990": seg6_diff,
    "1991 - 2000": seg7_diff,
    "2001 - 2010": seg8_diff,
    "2011 - 2020": seg9_diff,
}

# "Entries" is the number of gaps per period; "Mean Longevity" is their mean in days.
mean_longevity_df = pd.DataFrame({
    "Time Period": list(_gaps_by_period),
    "Entries": [len(gaps) for gaps in _gaps_by_period.values()],
    "Mean Longevity": [mean(gaps) for gaps in _gaps_by_period.values()],
})

81
41.22222222222222
55
65.01818181818182
29
113.44827586206897
31
107.48387096774194
49
72.28571428571429
81
42.77777777777778
103
35
234
15.525641025641026
294
12.285714285714286


In [None]:
# Show the summary table (time period, number of gaps, mean gap in days) built in the cell above
mean_longevity_df

Unnamed: 0,Time Period,Entries,Mean Longevity
0,1931 - 1940,81,41.222222
1,1941 - 1950,55,65.018182
2,1951 - 1960,29,113.448276
3,1961 - 1970,31,107.483871
4,1971 - 1980,49,72.285714
5,1981 - 1990,81,42.777778
6,1991 - 2000,103,35.0
7,2001 - 2010,234,15.525641
8,2011 - 2020,294,12.285714
