# Convert Pleco dump to spreadsheet

In [1]:
# Import the XML export and flatten every <card> element into one dict per card.
import xml.etree.ElementTree as ET
import csv

tree = ET.parse('./Report.xml')
root = tree.getroot()

card_data = []
for card in root.findall(".//card"):
    entry = card.find("./entry")
    scoreinfo = card.find("scoreinfo")
    # A card with no entry or no score data has nothing to report; skip it
    # rather than crash on a None lookup.
    if entry is None or scoreinfo is None:
        continue

    headword = entry.find("./headword[@charset='tc']")
    cantopron = entry.find("./cantopron")
    # Only cards that carry both a traditional-charset headword and a
    # Cantonese pronunciation are kept.
    if headword is None or cantopron is None:
        continue

    # Key order is preserved on purpose: it becomes the column order of the
    # DataFrame built from card_data.
    card_data.append({
        # XML attributes are strings; these two stamps are left as-is here.
        "created_stamp": card.get("created"),
        "modified_stamp": card.get("modified"),
        "traditional": headword.text,
        "jyutping": cantopron.text,
        "correct": int(scoreinfo.get("correct")),
        "incorrect": int(scoreinfo.get("incorrect")),
        "difficulty": int(scoreinfo.get("difficulty")),
        # priority is optional in the export, so it may be None.
        "priority": scoreinfo.get("priority"),
        "last_reviewed_stamp": int(scoreinfo.get("lastreviewedtime")),
    })
    
    

    
    
# Convert the parsed cards to a DataFrame and export it as CSV.
import pandas as pd

cards_frame = pd.DataFrame(card_data)
for stamp in ["last_reviewed_stamp", "created_stamp", "modified_stamp"]:
    # created/modified come straight from XML attributes and are therefore
    # strings. Cast to numbers first: passing strings together with `unit=` to
    # to_datetime is deprecated and will be parsed as date strings in future
    # pandas versions.
    cards_frame[stamp] = pd.to_datetime(pd.to_numeric(cards_frame[stamp]), unit='s')

# Newest cards first. Reassign instead of sorting in place so that re-running
# the cell always starts from the freshly built frame.
cards_frame = cards_frame.sort_values("created_stamp", ascending=False)
cards_frame.to_csv('./Report.csv', index=False)


# View 30 Most Recently Added Vocabs

In [2]:
# The 30 cards with the latest creation time, newest first.
cards_frame.sort_values(by=["created_stamp"], ascending=[False]).iloc[:30]

Unnamed: 0,created_stamp,modified_stamp,traditional,jyutping,correct,incorrect,difficulty,priority,last_reviewed_stamp
941,2021-10-19 20:35:07,2021-10-20 02:59:34,身體,san1 tai2,1,3,90,,2021-10-27 14:57:10
940,2021-10-19 20:34:28,2021-10-20 02:59:34,乾淨,gon1 zeng6,0,4,90,,2021-10-27 14:58:53
939,2021-10-19 20:33:34,2021-10-20 02:59:34,廢氣,fai3 hei3,1,2,90,,2021-10-27 14:58:38
938,2021-10-19 20:33:19,2021-10-20 02:59:34,運作,wan6 zok3,0,4,90,,2021-10-27 14:56:49
937,2021-10-19 20:32:51,2021-10-20 02:59:34,要求,jiu1 kau4,1,2,90,,2021-10-25 19:35:02
936,2021-10-19 20:31:58,2021-10-20 02:59:34,突然,dat6 jin4,0,4,90,,2021-10-27 14:57:36
935,2021-10-19 20:31:12,2021-10-20 02:59:34,污染,wu1 jim5,1,2,90,,2021-10-27 14:58:46
934,2021-10-19 20:30:27,2021-10-20 02:59:34,空氣,hung1 hei3,1,2,90,,2021-10-25 19:36:17
933,2021-10-19 20:30:02,2021-10-20 02:59:34,找數,zaau2 sou3,0,4,90,,2021-10-27 14:57:03
932,2021-10-05 20:18:22,2021-10-05 20:20:41,從來,cung4 loi4,1,4,90,,2021-10-27 14:57:16


# View 30 Most Recently Reviewed Vocabs

In [3]:
# The 30 cards with the latest review time, newest first.
cards_frame.sort_values(by=["last_reviewed_stamp"], ascending=[False]).iloc[:30]

Unnamed: 0,created_stamp,modified_stamp,traditional,jyutping,correct,incorrect,difficulty,priority,last_reviewed_stamp
545,2020-11-04 00:59:50,2021-07-23 16:14:07,睇吓,tai2 haa3,6,8,80,veryhigh,2021-10-27 15:07:32
586,2020-11-18 04:17:05,2021-07-23 16:14:07,嗰度,go2 dou6,6,6,70,veryhigh,2021-10-27 15:07:26
506,2020-10-27 15:58:29,2021-07-23 16:14:07,禮拜,lai5 baai3,5,8,70,veryhigh,2021-10-27 15:07:16
516,2020-10-27 22:37:56,2021-07-23 16:14:07,幾點,gei2 dim2,5,3,70,veryhigh,2021-10-27 15:02:11
742,2021-03-09 19:23:48,2021-07-30 18:24:45,新冠,san1 gun1,5,4,68,veryhigh,2021-10-27 15:01:51
12,2020-01-12 02:49:41,2021-07-23 16:14:07,朋友,pang4 jau5,24,10,70,veryhigh,2021-10-27 15:01:44
805,2021-04-26 20:39:56,2021-07-23 16:14:07,跟住,gan1 zyu6,7,13,60,veryhigh,2021-10-27 15:01:32
598,2020-11-25 14:53:10,2021-07-23 16:14:07,唔得,m4 dak1,3,2,80,veryhigh,2021-10-27 15:01:21
427,2020-07-29 19:22:57,2021-07-23 16:14:07,屋企,uk1 kei2,16,12,50,veryhigh,2021-10-27 15:01:13
928,2021-10-05 20:02:46,2021-10-05 20:20:41,攞,lo2,3,1,90,,2021-10-27 14:58:57


# View 30 Random Vocabs

In [4]:
# Shuffle every row, then keep the first 30 of the shuffled frame.
cards_frame.sample(frac=1.0).iloc[:30]

Unnamed: 0,created_stamp,modified_stamp,traditional,jyutping,correct,incorrect,difficulty,priority,last_reviewed_stamp
124,2020-04-05 02:06:50,2021-07-23 16:14:07,茶,caa4,11,3,112,,2020-09-29 17:13:05
582,2020-11-18 04:10:09,2021-09-27 23:14:11,單身,daan1 san1,1,0,100,,2021-09-27 23:14:36
137,2020-04-05 02:33:50,2021-07-23 16:14:07,麵包,min6 baau1,10,11,72,,2021-06-10 20:47:39
288,2020-05-16 02:15:41,2021-07-23 16:14:07,黑人,Hak1 jan4,8,1,102,,2020-11-30 18:58:25
643,2021-01-06 20:00:09,2021-07-23 16:14:07,常常,soeng4 soeng4,1,2,80,,2021-05-27 20:17:05
247,2020-05-04 15:20:42,2021-07-23 16:14:07,奶茶,naai5 caa4,13,10,50,,2021-04-10 17:28:36
492,2020-09-30 16:15:44,2021-08-03 12:55:06,中秋節,zung1 cau1 zit3,1,1,90,,2021-08-05 15:40:54
609,2020-12-09 17:28:27,2021-07-23 16:14:07,檢查,gim2 caa4,2,7,80,,2021-03-16 16:25:40
170,2020-04-11 03:16:01,2021-07-23 16:14:07,老鼠,lou5 syu2,7,17,58,,2021-02-07 19:54:53
831,2021-05-10 17:47:13,2021-07-23 16:14:07,雞同鴨講,gai1 tung4 aap3 gong2,1,1,90,,2021-06-08 16:47:30


# Cell Reserved for Searching

In [5]:
# Edit SEARCH_TERM to look up cards whose headword contains that text.
SEARCH_TERM = '一'
# regex=False matches the term literally, so characters such as "?" or "("
# can be searched for; na=False treats rows with missing text as non-matches
# instead of producing an NA mask that cannot be used for indexing.
cards_frame[cards_frame["traditional"].str.contains(SEARCH_TERM, regex=False, na=False)]

Unnamed: 0,created_stamp,modified_stamp,traditional,jyutping,correct,incorrect,difficulty,priority,last_reviewed_stamp
904,2021-09-10 21:12:25,2021-09-27 23:14:11,一起,jat1 hei2,1,0,100,,2021-09-27 18:54:38
902,2021-09-01 19:32:38,2021-09-01 19:44:38,一鑊熟,jat1 wok6 suk6,1,1,90,,2021-09-09 21:34:02
897,2021-09-01 19:29:19,2021-09-01 19:44:38,一天都光晒,jat1 tin1 dou1 gwong1 saai3,1,0,100,,2021-09-05 15:09:00
825,2021-05-03 22:22:57,2021-07-23 16:14:07,等一陣,dang2 jat1 zan6,2,1,90,,2021-06-08 16:50:24
823,2021-05-03 22:22:01,2021-07-23 16:14:07,一陣,jat1 zan6,0,2,90,,2021-06-08 16:45:12
811,2021-04-26 21:10:09,2021-07-23 16:14:07,一半,jat1 bun3,2,3,80,,2021-05-27 19:26:27
809,2021-04-26 21:04:21,2021-07-23 16:14:07,郁咗一郁,juk1 zo2 jat1 juk1,2,1,90,,2021-05-24 16:44:35
749,2021-03-09 19:29:35,2021-07-23 16:14:07,一下,jat1 haa6,0,0,100,,2021-03-09 19:42:22
710,2021-02-09 21:04:08,2021-07-23 16:14:07,一條褲,jat1 tiu4 fu3,0,0,100,,2021-02-12 23:17:27
667,2021-01-20 17:04:19,2021-07-23 16:14:07,一直,jat1 zik6,2,1,90,,2021-02-09 16:09:10


# Weekly Review Stats

In [6]:
import datetime

# Count how many cards were last reviewed on each of the past seven calendar
# days (today included) and plot the result as a bar chart.

# normalize() truncates to midnight. round('1d') rounds to the *nearest*
# midnight, which is tomorrow whenever the cell is run after noon, shifting
# the whole window one day into the future.
today = pd.Timestamp.today().normalize()
week_range = pd.date_range(today - pd.Timedelta(days=6), today, freq='D')

# Bucket every review by its calendar day (midnight itself belongs to the day
# it starts), count per day, then align to the seven-day window so that days
# without reviews show up as 0.
# NOTE(review): the stamps are parsed from epoch seconds while `today` is the
# local wall-clock date — confirm whether a timezone shift is needed.
reviews_per_day = (
    cards_frame["last_reviewed_stamp"]
    .dt.floor('D')
    .value_counts()
    .reindex(week_range, fill_value=0)
)

date2count = {
    "date": week_range,
    "total reviews": reviews_per_day.tolist(),
    # Short label such as "Wed 10-27" for the x axis.
    "date_": [day.strftime('%a %m-%d') for day in week_range],
}
datesframe = pd.DataFrame(date2count)

datesframe.plot.bar(x='date_', y='total reviews', rot=60)


ImportError: matplotlib is required for plotting when the default backend "matplotlib" is selected.

In [None]:
# Clustering: group the vocabulary by word-vector similarity

In [None]:
# Load the pretrained word vectors that the later clustering cell indexes
# through `fb_model`.
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

# Word vectors
# (alternative models, currently disabled; Word2Vec is only needed by these lines)
# word_model = Word2Vec.load('wv_word')
# char_model = Word2Vec.load('wv_char')
# pos_model = Word2Vec.load('wv_char_position')

# Facebook's Chinese word vectors.
# NOTE(review): the file is named 'toastynews.vec' — confirm the description
# above still matches the vectors actually being loaded.
fb_model = KeyedVectors.load_word2vec_format('toastynews.vec')
# Looks up the vector of a single character; `vector` is not referenced by any
# other visible cell — presumably a smoke test that the model loaded.
vector = fb_model['一']


In [None]:
import random

from nltk.cluster import KMeansClusterer
import nltk

# Keep only the headwords the embedding model has a vector for; words without
# a vector are printed and left out of the clustering.
items = []
for word in cards_frame.traditional:
    word = word.strip("?")
    try:
        # Look up just this word. Re-looking up every previously accepted word
        # on each iteration made the loop quadratic for no benefit.
        fb_model[word]
    except KeyError:
        # Only a missing key means "not in the vocabulary"; any other error
        # should surface instead of being silently swallowed.
        print(word)
        continue
    items.append(word)


X = fb_model[items]
NUM_CLUSTERS = 55
# Fixed seed so the cluster assignment is reproducible across kernel restarts.
CLUSTER_SEED = 0
kclusterer = KMeansClusterer(
    NUM_CLUSTERS,
    distance=nltk.cluster.util.cosine_distance,
    repeats=25,
    rng=random.Random(CLUSTER_SEED),
)
# assigned_clusters[i] is the cluster index assigned to items[i].
assigned_clusters = kclusterer.cluster(X, assign_clusters=True)




In [None]:
# Write the clusters to a CSV with one column per cluster and one word per
# row; shorter clusters are padded with empty strings.

# Group the words by their assigned cluster (cluster ids become column names).
vocab_lists = {}
for i, item in enumerate(items):
    vocab_lists.setdefault(str(assigned_clusters[i]), []).append(item)

# Measure the longest column after grouping is complete. Tracking it only when
# appending to an existing cluster left it at 0 when every cluster held a
# single word, so no rows were written at all.
longest_length = max((len(words) for words in vocab_lists.values()), default=0)
for words in vocab_lists.values():
    words += [""] * (longest_length - len(words))

# Transpose columns into rows for DictWriter.
transposed = [
    {listname: words[row_n] for listname, words in vocab_lists.items()}
    for row_n in range(longest_length)
]

# newline="" is what the csv module expects (avoids blank lines on Windows);
# an explicit UTF-8 encoding keeps the Chinese text from depending on the
# platform's default encoding.
with open("cluster_lists.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=list(vocab_lists))
    writer.writeheader()
    writer.writerows(transposed)









# Print every word that was assigned to cluster 1.
for position, word in enumerate(items):
    if assigned_clusters[position] != 1:
        continue
    print(word)