# Convert Pleco flashcard dump to a spreadsheet

In [1]:
#import xml file
import xml.etree.ElementTree as ET
import csv

tree = ET.parse('./Report.xml')
root = tree.getroot()

# Build one record (dict) per flashcard. A card is only kept when it has an
# <entry> with a traditional-charset headword and a <cantopron>, plus a
# <scoreinfo>; cards missing any of these are skipped instead of raising.
card_data = []
for card in root.findall(".//card"):
    entry = card.find("./entry")
    if entry is None:
        continue

    headword = entry.find("./headword[@charset='tc']")
    pronunciation = entry.find("./cantopron")
    # Look the score element up once instead of once per attribute.
    scoreinfo = card.find("scoreinfo")
    if headword is None or pronunciation is None or scoreinfo is None:
        continue

    # Key order is preserved because it determines the column order of the
    # DataFrame built from these records.
    card_data.append({
        # Raw attribute strings; they are converted to datetimes when the
        # DataFrame is built.
        "created_stamp": card.get("created"),
        "modified_stamp": card.get("modified"),
        "traditional": headword.text,
        "jyutping": pronunciation.text,
        "correct": int(scoreinfo.get("correct")),
        "incorrect": int(scoreinfo.get("incorrect")),
        "difficulty": int(scoreinfo.get("difficulty")),
        # Kept as-is: this attribute may be absent (None).
        "priority": scoreinfo.get("priority"),
        "last_reviewed_stamp": int(scoreinfo.get("lastreviewedtime")),
    })
    
    

    
    
#convert xml to csv
import pandas as pd

cards_frame = pd.DataFrame(card_data)

# The three *_stamp columns hold second-resolution timestamps. Two of them come
# straight from XML attributes and are therefore strings; convert to numbers
# first, because pd.to_datetime(..., unit=...) on strings is deprecated and
# will eventually parse them as date strings instead of epoch offsets.
for stamp in ["last_reviewed_stamp", "created_stamp", "modified_stamp"]:
    cards_frame[stamp] = pd.to_datetime(pd.to_numeric(cards_frame[stamp]), unit='s')

# Newest cards first. Reassign rather than sort in place so that re-running
# the cell always starts from a freshly built frame.
cards_frame = cards_frame.sort_values("created_stamp", ascending=False)
cards_frame.to_csv('./Report.csv', index=False)


# View 30 Most Recently Added Vocabs

In [2]:
# The 30 cards with the latest creation time, newest first.
by_creation = cards_frame.sort_values(by="created_stamp", ascending=False)
by_creation.head(30)

Unnamed: 0,created_stamp,modified_stamp,traditional,jyutping,correct,incorrect,difficulty,priority,last_reviewed_stamp
974,2021-11-09 21:10:38,2021-11-09 21:14:37,洗手盆,sai2 sau2 pun4,0,1,90,,2021-11-10 18:38:47
973,2021-11-09 20:55:14,2021-11-09 21:14:37,第,dai6,1,0,100,,2021-11-10 18:38:33
972,2021-11-09 20:44:41,2021-11-09 21:14:37,矛盾,maau4 teon5,0,1,90,,2021-11-10 18:34:41
971,2021-11-09 20:44:19,2021-11-09 21:14:37,肯,hang2,0,1,90,,2021-11-10 18:34:32
970,2021-11-09 20:28:44,2021-11-09 21:14:37,癲,din1,0,1,90,,2021-11-10 18:38:02
969,2021-11-09 20:08:17,2021-11-09 21:14:37,出軌,ceot1 gwai2,0,1,90,,2021-11-10 18:38:29
968,2021-11-02 20:09:00,2021-11-02 20:11:32,外星人,ngoi6 sing1 jan4,1,2,90,,2021-11-10 04:32:46
967,2021-11-02 20:08:43,2021-11-02 20:11:32,解釋,gaai2 sik1,0,3,90,,2021-11-10 04:33:46
966,2021-11-02 20:08:30,2021-11-02 20:11:32,辦法,baan6 faat3,0,3,90,,2021-11-10 18:29:02
965,2021-11-02 19:59:11,2021-11-02 20:11:32,花樽,faa1 zeon1,0,1,90,,2021-11-10 18:31:30


# View 30 Most Recently Reviewed Vocabs

In [3]:
# The 30 cards with the latest review time, most recent first.
by_last_review = cards_frame.sort_values(by="last_reviewed_stamp", ascending=False)
by_last_review.head(30)

Unnamed: 0,created_stamp,modified_stamp,traditional,jyutping,correct,incorrect,difficulty,priority,last_reviewed_stamp
974,2021-11-09 21:10:38,2021-11-09 21:14:37,洗手盆,sai2 sau2 pun4,0,1,90,,2021-11-10 18:38:47
973,2021-11-09 20:55:14,2021-11-09 21:14:37,第,dai6,1,0,100,,2021-11-10 18:38:33
969,2021-11-09 20:08:17,2021-11-09 21:14:37,出軌,ceot1 gwai2,0,1,90,,2021-11-10 18:38:29
961,2021-11-02 19:43:49,2021-11-02 20:11:32,隔離屋,gaak3 lei4 uk1,0,1,90,,2021-11-10 18:38:17
970,2021-11-09 20:28:44,2021-11-09 21:14:37,癲,din1,0,1,90,,2021-11-10 18:38:02
972,2021-11-09 20:44:41,2021-11-09 21:14:37,矛盾,maau4 teon5,0,1,90,,2021-11-10 18:34:41
959,2021-11-02 19:38:01,2021-11-02 20:11:32,另一,ling6 jat1,0,1,90,,2021-11-10 18:34:36
971,2021-11-09 20:44:19,2021-11-09 21:14:37,肯,hang2,0,1,90,,2021-11-10 18:34:32
950,2021-10-29 16:06:27,2021-10-29 16:09:48,比喻,bei2 jyu6,0,1,90,,2021-11-10 18:34:26
958,2021-11-02 19:37:40,2021-11-02 20:11:32,惡,wu1,0,1,90,,2021-11-10 18:34:19


# View 30 Random Vocabs

In [4]:
# Shuffle every row (frac=1 samples the whole frame without replacement),
# then show the first 30 of the shuffled result.
shuffled_cards = cards_frame.sample(frac=1)
shuffled_cards.head(30)

Unnamed: 0,created_stamp,modified_stamp,traditional,jyutping,correct,incorrect,difficulty,priority,last_reviewed_stamp
878,2021-08-03 20:51:32,2021-08-04 12:29:59,約會,joek3 wui6,0,1,90,,2021-08-08 22:19:06
227,2020-04-22 14:04:39,2021-07-23 16:14:07,禮拜四,lai5 baai3 sei3,5,14,72,,2021-03-24 22:31:24
887,2021-09-01 19:18:54,2021-09-01 19:44:38,健康,gin6 hong1,1,1,90,,2021-09-09 21:32:38
91,2020-03-26 01:53:37,2021-07-23 16:14:07,梅青恬,mui4 cing1 tim4,13,5,100,,2020-11-03 19:53:12
836,2021-05-10 17:47:58,2021-07-23 16:14:07,騎牛搵馬,ke4 ngau4 wan2 maa5,1,1,90,,2021-06-08 16:49:45
366,2020-06-10 11:16:53,2021-07-23 16:14:07,坐,co5,3,14,60,,2021-02-07 20:09:21
2,2020-01-08 16:43:28,2021-07-23 16:14:07,同,tung4,17,9,50,,2021-03-24 22:31:31
779,2021-03-31 17:54:31,2021-07-23 16:14:07,肥牛,fei4 ngau4,2,4,80,low,2021-05-27 19:37:53
106,2020-03-28 01:54:51,2021-07-23 16:14:07,餃子,gaau2 zi2,14,17,50,,2021-03-23 14:38:18
255,2020-05-04 17:00:33,2021-07-23 16:14:07,貴,gwai3,12,12,50,,2021-08-14 15:42:53


# Cell Reserved for Searching

In [5]:
# Show every card whose Jyutping contains the search term (edit the term below).
search_term = 'zaau2'
# regex=False: match the term literally rather than as a regular expression.
# na=False: rows with no Jyutping text count as "no match" instead of putting
# NaN into the mask, which would make the boolean indexing raise.
cards_frame[cards_frame["jyutping"].str.contains(search_term, regex=False, na=False)]

Unnamed: 0,created_stamp,modified_stamp,traditional,jyutping,correct,incorrect,difficulty,priority,last_reviewed_stamp
936,2021-10-19 20:30:02,2021-10-20 02:59:34,找數,zaau2 sou3,0,4,90,,2021-10-27 14:57:03


# Weekly Review Stats

In [None]:
import datetime

import pandas as pd

# Midnight at the start of the current day. floor() is used instead of round():
# rounding to the nearest day jumps to *tomorrow* once it is past noon, which
# shifted the whole window one day forward and left an always-empty last bar.
today = pd.to_datetime("today").floor("D")

# The seven calendar days ending today (inclusive), as midnight timestamps.
week_range = pd.date_range(today - pd.Timedelta(days=6), today, freq="D")

# Count cards by the calendar day of their last review, then keep only the
# seven days of interest (days with no reviews become 0). Flooring each stamp
# to its day also counts a review stamped exactly at midnight, which the
# previous strict "<" comparison dropped.
# NOTE(review): the stamps are naive datetimes built from epoch seconds while
# "today" is local wall-clock time - confirm whether a timezone offset is needed.
reviews_per_day = (
    cards_frame["last_reviewed_stamp"]
    .dt.floor("D")
    .value_counts()
    .reindex(week_range, fill_value=0)
)

date2count = {
    "date": week_range,
    "total reviews": reviews_per_day.tolist(),
    # Human-readable axis label, e.g. "Wed 11-10".
    "date_": [day.strftime('%a %m-%d') for day in week_range],
}

datesframe = pd.DataFrame(date2count)

datesframe.plot.bar(x='date_', y='total reviews', rot=60)


<AxesSubplot:xlabel='date_'>

In [None]:
# Clustering: group the vocabulary by word-vector similarity

In [None]:
from gensim.models import KeyedVectors, Word2Vec

# Alternative locally saved Word2Vec models (currently unused):
#   word_model = Word2Vec.load('wv_word')
#   char_model = Word2Vec.load('wv_char')
#   pos_model = Word2Vec.load('wv_char_position')

# Load the pre-trained vectors from a word2vec-format file
# (described in the original note as Facebook's Chinese word vectors).
vectors_path = 'toastynews.vec'
fb_model = KeyedVectors.load_word2vec_format(vectors_path)

# Look up a single common character as a quick check that the model loaded.
vector = fb_model['一']


In [None]:
from nltk.cluster import KMeansClusterer
import nltk
import random

# Keep only the vocabulary words that have a vector in the model; words
# without one are printed so they can be inspected.
items = []
for word in cards_frame.traditional:
    word = word.strip("?")
    # A direct membership test replaces the old fb_model[items + [word]]
    # lookup, which re-fetched every accepted word on each iteration
    # (quadratic work) and hid unrelated errors behind a bare except.
    if word not in fb_model:
        print(word)
        continue
    items.append(word)

# One row of X per entry in items, in the same order.
X = fb_model[items]

NUM_CLUSTERS=55
# Fixed seed so that cluster ids are reproducible across kernel restarts.
CLUSTER_SEED = 0
kclusterer = KMeansClusterer(
    NUM_CLUSTERS,
    distance=nltk.cluster.util.cosine_distance,
    repeats=25,
    rng=random.Random(CLUSTER_SEED),
)
# assigned_clusters[i] is the cluster index assigned to items[i].
assigned_clusters = kclusterer.cluster(X, assign_clusters=True)




In [None]:
# Group the clustered words by cluster id (as a string, used as the CSV
# column name): {cluster id -> list of words in that cluster}.
vocab_lists = {}
for item, cluster_id in zip(items, assigned_clusters):
    vocab_lists.setdefault(str(cluster_id), []).append(item)

# Size of the largest cluster. Computed over all clusters so that it is also
# correct when every cluster holds a single word (previously it stayed 0 in
# that case and no rows were written).
longest_length = max((len(words) for words in vocab_lists.values()), default=0)

# Pad the shorter lists with empty strings so every column has the same height.
for words in vocab_lists.values():
    words += [""] * (longest_length - len(words))

# Transpose the columns into rows: one dict per CSV row, keyed by cluster id.
transposed = [
    dict(zip(vocab_lists.keys(), row_values))
    for row_values in zip(*vocab_lists.values())
]

# newline="" is what the csv module expects (avoids blank lines on Windows);
# an explicit UTF-8 encoding keeps the Chinese text writable regardless of
# the platform's default encoding.
with open("cluster_lists.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=list(vocab_lists.keys()))
    writer.writeheader()
    writer.writerows(transposed)









# Print, one per line, every word that was assigned to cluster 1.
cluster_one_words = [
    word for word, cluster_id in zip(items, assigned_clusters) if cluster_id == 1
]
for word in cluster_one_words:
    print(word)