In [1]:
import pandas as pd
import numpy as np
import ast
from pandarallel import pandarallel
import datetime
from tqdm import tqdm
from difflib import SequenceMatcher

# One-time notebook setup.
# tqdm.pandas() hooks tqdm into pandas (progress_apply and friends).
tqdm.pandas()
# Initialise pandarallel with its default settings; it logs the worker count it will use.
pandarallel.initialize()
# Do not truncate wide DataFrames when they are displayed.
pd.set_option('display.max_columns', None)

INFO: Pandarallel will run on 40 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Load the DataFrame stored in 'stemmed.pkl'; later cells read its
# 'publishTime', 'stemsText' and 'stemsDes' columns.
# NOTE(review): unpickling can execute code embedded in the file, so only load
# pickles that come from a trusted source.
df = pd.read_pickle('stemmed.pkl')

In [3]:
# Parse the publish timestamps into pandas datetimes so their ISO-calendar
# fields can be read in the following cells.
df['publishTime'] = pd.to_datetime(df.publishTime)

In [4]:
# Split every publish timestamp into its ISO-calendar year and week number.
# isocalendar() returns (ISO year, ISO week, ISO weekday); positions 0 and 1
# are the two fields kept here.
for column, position in (('year', 0), ('week', 1)):
    df[column] = df['publishTime'].apply(
        # Bind `position` as a default so each lambda keeps its own index.
        lambda stamp, position=position: stamp.isocalendar()[position]
    )

In [5]:
# Running week index across years: every ISO year is given 53 slots (the most
# weeks an ISO year can have), counted from ISO year 2017. ISO years that only
# have 52 weeks therefore leave one index unused.
BASE_ISO_YEAR = 2017
WEEK_SLOTS_PER_YEAR = 53


def _week_index(stamp):
    """Return (ISO year - BASE_ISO_YEAR) * WEEK_SLOTS_PER_YEAR + ISO week of ``stamp``."""
    iso_year, iso_week = stamp.isocalendar()[:2]
    return (iso_year - BASE_ISO_YEAR) * WEEK_SLOTS_PER_YEAR + iso_week


df['weekAll'] = df['publishTime'].apply(_week_index)

In [6]:
# Count, for every stem in the 'stemsText' column, how often it occurs overall
# ("total_freq") and how often within each week index (taken from 'weekAll').
# Resulting layout: {stem: {"total_freq": n, <week index>: n_in_week, ...}}.
#
# Only the two columns that are needed are iterated (zipped together) instead
# of calling DataFrame.iterrows(), which builds a full Series for every row and
# does not preserve column dtypes.
freqsText = {}
for week, tokens in tqdm(zip(df['weekAll'], df['stemsText']), total=len(df)):
    for token in tokens:
        counts = freqsText.get(token)
        if counts is None:
            # First sighting of this stem: "total_freq" is inserted first so it
            # stays the leading key of the per-stem dict.
            counts = freqsText[token] = {"total_freq": 0}
        counts["total_freq"] += 1
        # A week not seen before for this stem starts from zero.
        counts[week] = counts.get(week, 0) + 1

100%|████████████████████████████████████| 11552/11552 [00:15<00:00, 751.10it/s]


In [7]:
# Count, for every stem in the 'stemsDes' column, how often it occurs overall
# ("total_freq") and how often within each week index (taken from 'weekAll').
# Resulting layout: {stem: {"total_freq": n, <week index>: n_in_week, ...}}.
#
# Only the two columns that are needed are iterated (zipped together) instead
# of calling DataFrame.iterrows(), which builds a full Series for every row and
# does not preserve column dtypes.
freqsDes = {}
for week, tokens in tqdm(zip(df['weekAll'], df['stemsDes']), total=len(df)):
    for token in tokens:
        counts = freqsDes.get(token)
        if counts is None:
            # First sighting of this stem: "total_freq" is inserted first so it
            # stays the leading key of the per-stem dict.
            counts = freqsDes[token] = {"total_freq": 0}
        counts["total_freq"] += 1
        # A week not seen before for this stem starts from zero.
        counts[week] = counts.get(week, 0) + 1

100%|███████████████████████████████████| 11552/11552 [00:02<00:00, 4216.11it/s]


In [8]:
# Turn each {stem: {column: count}} mapping into a table with one row per stem,
# then copy the stem (the row index) into a trailing 'word' column.
freqsText = (
    pd.DataFrame.from_dict(freqsText, orient='index')
    .assign(word=lambda table: table.index)
)

freqsDes = (
    pd.DataFrame.from_dict(freqsDes, orient='index')
    .assign(word=lambda table: table.index)
)

In [9]:
# Give both frequency tables the same column layout: "total_freq", "word",
# then the week-index columns in ascending order.
# The week columns are picked by name (not by position) and are the union of
# the weeks found in either table, so a week that occurs in only one table is
# neither dropped nor a KeyError: it appears as an all-NaN column in the other.
meta_cols = ["total_freq", "word"]
week_cols = sorted((set(freqsText.columns) | set(freqsDes.columns)) - set(meta_cols))
new_cols = meta_cols + week_cols
freqsText = freqsText.reindex(columns=new_cols)
freqsDes = freqsDes.reindex(columns=new_cols)

# Most frequent stems first.
freqsText = freqsText.sort_values('total_freq', ascending=False)
freqsDes = freqsDes.sort_values('total_freq', ascending=False)

freqsText.head()

Unnamed: 0,total_freq,word,45,46,47,48,49,50,51,52,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285
go,163777,go,106.0,1681.0,2237.0,1861.0,2433.0,1953.0,2184.0,2201.0,2578.0,1889.0,1232.0,2123.0,1382.0,1578.0,1678.0,1497.0,1491.0,1642.0,1523.0,1839.0,1489.0,2144.0,1149.0,1518.0,1436.0,1368.0,2240.0,1951.0,2219.0,2844.0,2510.0,492.0,193.0,825.0,906.0,1026.0,849.0,1352.0,1044.0,1427.0,1174.0,1011.0,981.0,1703.0,1514.0,1271.0,916.0,1309.0,1068.0,1293.0,1299.0,1625.0,1624.0,908.0,936.0,896.0,1413.0,1393.0,1420.0,1351.0,1293.0,1150.0,1367.0,1616.0,1067.0,1129.0,1095.0,1075.0,1352.0,1460.0,1320.0,1388.0,1339.0,1125.0,1290.0,1015.0,1040.0,1210.0,1101.0,1167.0,1171.0,1032.0,1182.0,1364.0,783.0,1213.0,985.0,960.0,900.0,1167.0,1225.0,1032.0,1208.0,959.0,1240.0,1098.0,1771.0,1264.0,1067.0,1139.0,1093.0,1319.0,1294.0,1015.0,1098.0,1230.0,813.0,932.0,845.0,882.0,877.0,842.0,1177.0,1216.0,958.0,1240.0,1041.0,1423.0,1085.0,863.0,1175.0,1028.0,1417.0,1137.0,1196.0,27.0
like,100000,like,52.0,965.0,1243.0,1010.0,1176.0,1154.0,1213.0,1268.0,1591.0,1149.0,806.0,1271.0,713.0,981.0,1090.0,905.0,926.0,929.0,992.0,1050.0,984.0,996.0,725.0,875.0,794.0,814.0,1402.0,1221.0,1351.0,1613.0,1519.0,212.0,106.0,567.0,564.0,567.0,482.0,763.0,637.0,805.0,664.0,679.0,574.0,1010.0,913.0,728.0,588.0,790.0,663.0,734.0,870.0,896.0,921.0,585.0,592.0,612.0,775.0,881.0,917.0,921.0,799.0,677.0,900.0,994.0,681.0,745.0,852.0,738.0,732.0,886.0,830.0,862.0,717.0,787.0,669.0,544.0,731.0,692.0,745.0,742.0,591.0,682.0,731.0,939.0,528.0,745.0,684.0,600.0,581.0,794.0,823.0,751.0,666.0,634.0,612.0,804.0,920.0,807.0,586.0,699.0,768.0,753.0,893.0,670.0,638.0,707.0,570.0,626.0,679.0,629.0,640.0,587.0,873.0,675.0,650.0,758.0,666.0,978.0,695.0,608.0,694.0,671.0,829.0,675.0,755.0,19.0
good,83597,good,47.0,835.0,1040.0,973.0,1053.0,940.0,1146.0,1186.0,1375.0,1054.0,614.0,991.0,718.0,935.0,958.0,811.0,839.0,879.0,863.0,961.0,843.0,940.0,730.0,912.0,797.0,655.0,1290.0,994.0,1093.0,1404.0,1278.0,245.0,115.0,457.0,459.0,451.0,386.0,641.0,443.0,622.0,578.0,457.0,500.0,711.0,755.0,610.0,463.0,652.0,478.0,611.0,614.0,721.0,776.0,504.0,441.0,469.0,636.0,680.0,605.0,697.0,678.0,583.0,690.0,793.0,597.0,539.0,604.0,606.0,670.0,621.0,715.0,677.0,691.0,672.0,558.0,522.0,661.0,621.0,550.0,537.0,579.0,557.0,574.0,688.0,448.0,640.0,577.0,543.0,514.0,601.0,661.0,534.0,638.0,467.0,546.0,676.0,861.0,603.0,516.0,640.0,583.0,748.0,630.0,551.0,623.0,724.0,349.0,462.0,561.0,440.0,444.0,441.0,688.0,620.0,508.0,599.0,491.0,670.0,499.0,454.0,652.0,518.0,759.0,532.0,666.0,6.0
littl,79815,littl,76.0,897.0,1055.0,829.0,1063.0,975.0,977.0,1032.0,1142.0,1005.0,626.0,994.0,613.0,767.0,736.0,712.0,661.0,620.0,737.0,783.0,707.0,720.0,554.0,659.0,658.0,820.0,888.0,734.0,871.0,901.0,901.0,141.0,90.0,398.0,371.0,452.0,412.0,639.0,544.0,554.0,519.0,675.0,516.0,846.0,907.0,588.0,530.0,561.0,536.0,748.0,659.0,836.0,875.0,482.0,564.0,441.0,572.0,685.0,673.0,567.0,664.0,528.0,767.0,844.0,536.0,610.0,630.0,614.0,672.0,710.0,654.0,788.0,557.0,630.0,540.0,451.0,685.0,588.0,609.0,512.0,562.0,589.0,561.0,799.0,423.0,635.0,579.0,616.0,543.0,648.0,660.0,539.0,563.0,500.0,524.0,639.0,813.0,606.0,466.0,558.0,556.0,605.0,636.0,502.0,531.0,507.0,447.0,516.0,484.0,514.0,456.0,514.0,707.0,541.0,517.0,642.0,610.0,782.0,544.0,483.0,645.0,529.0,717.0,638.0,680.0,6.0
one,74764,one,40.0,616.0,789.0,670.0,781.0,758.0,825.0,848.0,895.0,769.0,488.0,758.0,535.0,549.0,646.0,574.0,579.0,611.0,668.0,793.0,707.0,662.0,479.0,689.0,549.0,473.0,817.0,674.0,848.0,893.0,845.0,112.0,93.0,427.0,526.0,491.0,407.0,644.0,547.0,568.0,553.0,504.0,465.0,749.0,743.0,578.0,542.0,590.0,508.0,597.0,614.0,643.0,635.0,426.0,499.0,473.0,600.0,588.0,683.0,702.0,696.0,611.0,584.0,723.0,594.0,562.0,624.0,582.0,632.0,653.0,584.0,627.0,604.0,582.0,515.0,515.0,502.0,561.0,530.0,553.0,527.0,513.0,558.0,747.0,434.0,544.0,511.0,495.0,455.0,576.0,602.0,567.0,593.0,526.0,506.0,596.0,749.0,649.0,516.0,505.0,630.0,583.0,609.0,568.0,586.0,640.0,502.0,531.0,490.0,532.0,546.0,500.0,643.0,570.0,512.0,594.0,571.0,715.0,652.0,679.0,672.0,812.0,794.0,755.0,733.0,7.0


In [10]:
# Persist both frequency tables as pickle files in the working directory.
# Plain string literals are used because the file names contain no
# placeholders to format.
freqsText.to_pickle('textAll.pkl')
freqsDes.to_pickle('desAll.pkl')

In [11]:
# Shell escape: list the contents of the current working directory.
!ls

addingEnglish.ipynb  des3.0.pkl		scripted.zip	text3.0.pkl
addScript.ipynb      des4.0.pkl		script.sh	text4.0.pkl
addScript.py	     des5.0.pkl		split.ipynb	text5.0.pkl
channel.csv	     des6.0.pkl		stemmed.csv	text6.0.pkl
channel_details.csv  des7.0.pkl		stemmed.pkl	text7.0.pkl
check.ipynb	     des8.0.pkl		stemming.ipynb	text8.0.pkl
data.csv	     des9.0.pkl		stems.txt	text9.0.pkl
des0.0.pkl	     desAll.pkl		text0.0.pkl	textAll.pkl
des10.0.pkl	     des.pkl		text10.0.pkl	thumbnail.ipynb
des1.0.pkl	     english.csv	text1.0.pkl	trendingWords.ipynb
des11.0.pkl	     frequencies.ipynb	text11.0.pkl	trends.ipynb
des12.0.pkl	     images		text12.0.pkl	untitled.txt
des13.0.pkl	     nohup.out		text13.0.pkl	videoIds
des14.0.pkl	     regroup.ipynb	text14.0.pkl	viz.ipynb
des2.0.pkl	     scripted		text2.0.pkl	words.txt
