In [1]:
import pandas as pd
import numpy as np
import re

import spacy
nlp = spacy.load('en_core_web_sm')


In [2]:
path = "data/filecommits.csv"
protocols = pd.read_csv(path, index_col=0)

In [3]:
from datetime import datetime
protocols["ts"] = protocols.date_commit.apply(lambda x: pd.datetime.fromtimestamp(x).date())

In [4]:
def remove_urls(s):
    s = re.sub('[^\s]*.com[^\s]*', "", s)
    s = re.sub('[^\s]*www.[^\s]*', "", s)
    s = re.sub('[^\s]*.co.uk[^\s]*', "", s)
    return s

In [5]:
protocols['clean_text'] = protocols['text'].map(remove_urls)

In [6]:
#Tokenising with Spacy
protocols['tokens_clean_text'] = protocols['clean_text'].map(nlp)


In [7]:
protocols['sentences_clean_text'] = protocols['tokens_clean_text'].apply(lambda toks: list(toks.sents))

In [8]:
for token in protocols['tokens_clean_text'][0]:
    print (token, token.idx, token.text_with_ws,token.is_alpha, token.is_punct, token.is_space,token.shape_, token.is_stop)
    break
    

Day 0 Day  True False False Xxx False


In [11]:
#things we want to do:

# Entities and Sentiment? :)


# get number of tokens per document
protocols["token_count"] = protocols['tokens_clean_text'].map(len)
# get number of sentences
protocols["sentence_count"] = protocols['sentences_clean_text'].map(len)

# get number of words that are not punctions and so on
protocols["content_words_alpha"] = protocols['tokens_clean_text'].apply(lambda toks: [token for token in toks if token.is_alpha is True and token.is_stop is False and token.is_punct is False])

protocols["content_word_count"] = protocols['content_words_alpha'].map(len)

protocols["content_words_alpha_len"] = protocols["content_words_alpha"].apply(lambda words: [len(word) for word in words if len(word) > 2])
# get avg word length
protocols["avg_word_len"] = protocols["content_words_alpha_len"].apply(lambda lens: sum(lens)/len(lens))
#median word length
protocols["median_word_len"] = protocols["content_words_alpha_len"].apply(lambda lens: sorted(lens)[len(lens)//2 - 1])

# get median word length


In [13]:
#getting the cardinality of docs per person
#sort by date/filenam
protocols = protocols.sort_values(["ts", "filename"]).reset_index(drop=True)


In [14]:
protocols['author_day'] = protocols.groupby('author').cumcount()

In [15]:
protocols[["author", "author_day"]]

Unnamed: 0,author,author_day
0,Erin Robinson,0
1,Bonnie Brown,0
2,Timothy Stevens,0
3,Larry Sanders,0
4,Zachary Brooks,0
5,William Rodriguez,0
6,Amy Williams,0
7,Bonnie Williams,0
8,Nicole Johnson,0
9,Thomas Hansen,0


In [16]:
protocols.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39 entries, 0 to 38
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   filename                 39 non-null     object 
 1   author                   39 non-null     object 
 2   date_commit              39 non-null     int64  
 3   text                     39 non-null     object 
 4   ts                       39 non-null     object 
 5   clean_text               39 non-null     object 
 6   tokens_clean_text        39 non-null     object 
 7   sentences_clean_text     39 non-null     object 
 8   token_count              39 non-null     int64  
 9   sentence_count           39 non-null     int64  
 10  content_words_alpha      39 non-null     object 
 11  content_word_count       39 non-null     int64  
 12  content_words_alpha_len  39 non-null     object 
 13  avg_word_len             39 non-null     float64
 14  median_word_len          39 

In [17]:
source = protocols[["author_day", "author", "sentence_count", "median_word_len"]]

source['author_day'] = source['author_day'] + 1
source.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39 entries, 0 to 38
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   author_day       39 non-null     int64 
 1   author           39 non-null     object
 2   sentence_count   39 non-null     int64 
 3   median_word_len  39 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 1.3+ KB


In [51]:
protocols[["author_day", "author", "sentence_count", "token_count"]].sum()

author_day                                                       39
author            Erin RobinsonBonnie BrownTimothy StevensLarry ...
sentence_count                                                 2441
token_count                                                   31737
dtype: object

In [53]:
import altair as alt

alt.Chart(source).mark_circle(
    opacity=0.8,
    stroke='black',
    strokeWidth=1
).encode(
    alt.X('author_day:Q', scale=alt.Scale(domain=[0.5, 3.5]),axis=alt.Axis(grid=False,tickMinStep=1), title='Protocol count'),
    alt.Y('author:N',axis=alt.Axis(grid=True),  title=" "),
    alt.Size('sentence_count',
        scale=alt.Scale(range=[0, 3000]),
        legend=alt.Legend(title='sentence count')
    ),
   color = alt.Color('median_word_len', legend=alt.Legend(title='median word length'),  scale=alt.Scale(scheme='lightmulti')),
   tooltip=[
        alt.Tooltip('median_word_len:Q', title='Word len'),
        alt.Tooltip('sentence_count:Q', title='Sentence count')
    ]
).properties(
    width=500,
    height=500
).properties(height=400, width=550, title='Sentences per protocol by student').configure(font='OpenSans').configure_legend(
    symbolSize=150,
    labelFontSize=15,
    titleFontSize=15,
    padding = 40
).configure_title(
    fontSize=20,
    color='#000000',
).configure_legend(
    symbolSize=100,
    labelFontSize=15
)

In [114]:
import altair as alt

color = alt.Color('author', scale=scale)

bubbles = alt.Chart().mark_circle(
    opacity=0.8,
    stroke='black',
    strokeWidth=1
).encode(
    alt.X('author_day', scale=alt.Scale(domain=[0, 4]),axis=alt.Axis(grid=False) ),
    alt.Y('median_word_len',axis=alt.Axis(grid=True)),
    alt.Size('sentence_count',
        scale=alt.Scale(range=[0, 3000]),
        legend=alt.Legend(title='sentence count')
    ),
   color = alt.condition(brush, color, alt.value('lightgray')),
   tooltip=[
        alt.Tooltip('median_word_len:Q', title='Word len'),
        alt.Tooltip('sentence_count:Q', title='Sentence count')
    ]
).properties(
    width=500,
    height=500
)


# Bottom panel is a bar chart of weather type
bars = alt.Chart().mark_bar().encode(
    x='count()',
    y='author',
    color=alt.condition(click, color, alt.value('lightgray')),
).transform_filter(
    brush
).properties(
    width=550,
).add_selection(
    click
)


alt.vconcat(
    bubbles,
    bars,
    data=source,
    title="neuefische DS-4 protocols 2020"
)




In [None]:
import altair as alt
from vega_datasets import data

source = data.disasters.url

alt.Chart(source).mark_circle(
    opacity=0.8,
    stroke='black',
    strokeWidth=1
).encode(
    alt.X('Year:O', axis=alt.Axis(labelAngle=0)),
    alt.Y('Entity:N'),
    alt.Size('Deaths:Q',
        scale=alt.Scale(range=[0, 4000]),
        legend=alt.Legend(title='Annual Global Deaths')
    ),
    alt.Color('Entity:N', legend=None)
).properties(
    width=450,
    height=320
).transform_filter(
    alt.datum.Entity != 'All natural disasters'
)