In [77]:
import pyarrow.parquet as pa
import pandas as pd
from ipywidgets import AppLayout, GridspecLayout, Button, Layout

In [78]:
# Dataset location, named once so it is easy to find and swap.
PARQUET_PATH = "Midjourney2023_Upscale.parquet"
# NOTE: `pa` is the alias this notebook gives to pyarrow.parquet.
table = pa.read_table(PARQUET_PATH)

In [79]:
# Show the Arrow table's repr: its schema followed by a truncated preview of each column.
table

pyarrow.Table
prompt: string
user: string
timestamp: timestamp[ns, tz=UTC]
image_url: string
----
prompt: [["a30 year old Australian male in a bold grey top, no filter, at a laptop in a professional marketing agency setting with pale orange accents, editorial style photo","a30 year old Australian male in a bold grey top, no filter, at a laptop in a professional marketing agency setting with pale orange accents, editorial style photo","a30 year old Australian male in a bold grey top, no filter, at a laptop in a professional marketing agency setting with pale orange accents, editorial style photo","a30 year old Australian male in a bold grey top, no filter, at a laptop in a professional marketing agency setting with pale orange accents, editorial style photo","a millennial Australian male in a bold grey top, no filter, at a laptop in an agency setting with soft red accents, editorial style photo",...,"Dissociative personality disorder, Korean woman raw   ","Dissociative personality disor

In [80]:
# (number of rows, number of columns) in the Arrow table.
table.shape

(973134, 4)

In [81]:
# Convert the Arrow table into a pandas DataFrame for the rest of the analysis.
df = table.to_pandas()
# Preview the first five rows, transposed so each record reads top-to-bottom.
df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
prompt,a30 year old Australian male in a bold grey to...,a30 year old Australian male in a bold grey to...,a30 year old Australian male in a bold grey to...,a30 year old Australian male in a bold grey to...,a millennial Australian male in a bold grey to...
user,838569662941102110,838569662941102110,838569662941102110,838569662941102110,838569662941102110
timestamp,2023-11-04 08:05:34.078000+00:00,2023-11-04 08:05:26.130000+00:00,2023-11-04 08:05:24.309000+00:00,2023-11-04 08:05:21.934000+00:00,2023-11-04 08:04:54.067000+00:00
image_url,https://cdn.discordapp.com/attachments/9954314...,https://cdn.discordapp.com/attachments/9954314...,https://cdn.discordapp.com/attachments/9954314...,https://cdn.discordapp.com/attachments/9954314...,https://cdn.discordapp.com/attachments/9954314...


In [82]:
# Column dtypes of the DataFrame produced by the Arrow -> pandas conversion.
df.dtypes

prompt                    object
user                      object
timestamp    datetime64[ns, UTC]
image_url                 object
dtype: object

In [83]:
# `timestamp` is expected to already be a datetime column after the Arrow
# conversion, so only parse it when that is not the case. This avoids
# needlessly rewriting a column of `df` in place on every run of the cell.
if not pd.api.types.is_datetime64_any_dtype(df["timestamp"]):
    df["timestamp"] = pd.to_datetime(df["timestamp"])

# Earliest and latest timestamps present in the dataset.
first_timestamp = df["timestamp"].min()
last_timestamp = df["timestamp"].max()
print(f"First timestamp: {first_timestamp}")
print(f"Last timestamp: {last_timestamp}")

First timestamp: 2023-10-24 03:56:41.108000+00:00
Last timestamp: 2023-11-09 03:33:57.146000+00:00


## Prompt Data

In [84]:
# Note: .str.len() measures the number of characters in each prompt, not words.
prompt_chars = df["prompt"].str.len()
df["promptLength"] = prompt_chars
df["promptLength"].describe()

count    973134.000000
mean        177.577442
std         206.590229
min           1.000000
25%          67.000000
50%         118.000000
75%         208.000000
max        1800.000000
Name: promptLength, dtype: float64

In [85]:
# Distribution of prompt lengths in words: for each distinct word count, how
# many prompts have that many words.
#
# value_counts() returns one row per distinct word count (indexed by that
# count), not one row per prompt. It is therefore kept as its own Series:
# assigning it to a column of `df` would align it against the row index and
# leave NaN in almost every row. Per-prompt word counts are a separate
# calculation (split, then take the length of each list).
#
# Splitting approach from https://stackoverflow.com/questions/37483470/how-to-calculate-number-of-words-in-a-string-in-dataframe
wordCountFrequencies = (
    df["prompt"].str.split().str.len().value_counts().rename("wordCount")
)
wordCountFrequencies.describe()

count      310.000000
mean      3139.141935
std       7509.070470
min          1.000000
25%         49.000000
50%        164.500000
75%       1120.000000
max      35530.000000
Name: wordCount, dtype: float64

In [86]:
# Words per prompt: split each prompt on whitespace, then count the pieces.
prompt_tokens = df["prompt"].str.split()
df["word_count"] = prompt_tokens.str.len()
df["word_count"].describe()

count    973134.000000
mean         25.990573
std          29.710369
min           1.000000
25%          10.000000
50%          18.000000
75%          31.000000
max         377.000000
Name: word_count, dtype: float64

- **Minimum word count**: 1
- **Maximum word count**: 377

In [87]:
# Full text of the prompt stored at row label 1346.
df.loc[1346, "prompt"]

'(photorealistic) (masterpiece) Anime style. Scene from an anime movie. Shinichi Aizawa, a 20 year-old male student with slightly unruly black hair and blue eyes wearing a black tee-shirt, blue jeans and brown shoes.sitting back on the ground. He is facing a single talking red fox that is sitting in front of him. He looks panickedat being addressed by the animal. They are surrounded by high golden grain. Blue sky with streaky pink and white clouds. In the style of Don Bluth.  --niji'

In [88]:
# Boolean mask: True where the prompt text contains "niji" in any letter case;
# missing prompts are treated as non-matches.
containsNiji = df["prompt"].str.contains("niji", na=False, case=False)
# Keep only the rows flagged by the mask.
filteredForNiji = df.loc[containsNiji]

In [89]:
#How many prompts are (correctly) using parameters? Aspect ratio, chaos, no, style, weird
#How many prompts are trying to use parameters incorrectly? i.e., without the leading --
#How many prompts are using older versions? --niji 4, --niji 6, --version/--v <1, 2, 3, 4, 5, 5.1, 5.2>

In [90]:
# Plain-text dump of the rows whose prompt contains "niji"; pandas elides the middle rows.
print(filteredForNiji)

                                                   prompt                user  \
694     niji 5, Halloween, monsters, tattoo design, si...  878376022184169524   
1151    a male human angel with wings made of golden l...  353023489541931009   
1156    a male human angel with wings made of golden l...  353023489541931009   
1157    a male human angel with wings made of golden l...  353023489541931009   
1346    (photorealistic) (masterpiece) Anime style. Sc...  726804658579963946   
...                                                   ...                 ...   
971042  (msterpiece) (highly detailed) a teenage detec...  726804658579963946   
971051  (masterpiece) Full figures visible. Dynamic po...  726804658579963946   
971052  (masterpiece) Full figures visible. Dynamic po...  726804658579963946   
971053  (masterpiece) Full figures visible. Dynamic po...  726804658579963946   
971054  (masterpiece) Full figures visible. Dynamic po...  726804658579963946   

                           

---
**Note to self:** every derived column I add to `df` shows up in later displays (like the one above), so I need to be deliberate about which columns I add.

In [91]:
#Any correlation between prompt length and reuse?

In [92]:
#How many prompts are repeated without changes?
#How many prompts are repeated with only parameter changes?
#How many prompts are repeated with only a single word change? (This could be tricky: need to distinguish a word being added, removed, etc.)

## User Data
---

In [93]:
# Number of distinct values in the `user` column.
uniqueUsers = df["user"].nunique()
uniqueUsers

24487

In [94]:
#How many users posted every day?
#Unique users per day?

In [95]:
# Number of rows (prompts) per user, sorted from most to fewest.
promptsPerUser = df["user"].value_counts()
promptsPerUser

user
1133120398163193937    8666
1142211736116662343    7327
140316570487488512     6868
1017944538330959934    4200
598593981537976372     4133
                       ... 
971143636362358835        1
1079337279329407026       1
1092071164161048717       1
1064717812700028938       1
1124031946381611079       1
Name: count, Length: 24487, dtype: int64

In [96]:
#How many users are using the same prompt that another user prompted?

In [97]:
#Look at how many users only posted 1 time

## Time Data
---

In [98]:
#How many prompts per day?
#Frequency over 24 hour period?

## Visuals
---

In [110]:
def create_expanded_button(description, button_style):
    """Build a Button whose layout uses automatic height and width.

    description: text passed to the Button as its ``description``.
    button_style: value passed to the Button as its ``button_style``.
    """
    auto_layout = Layout(height='auto', width='auto')
    return Button(
        description=description,
        button_style=button_style,
        layout=auto_layout,
    )
    

# Layout mock-up on a 4-row x 3-column grid, 300px tall. Each region holds a
# button labelled after the component it is meant to contain (presumably
# placeholders for the real widgets).
BUTTON_STYLE = 'info'
grid = GridspecLayout(4, 3, height='300px')

# Rows 0-2 of the two right-hand columns.
grid[:3, 1:] = create_expanded_button('Word Cloud', BUTTON_STYLE)
# Entire left-hand column.
grid[:, 0] = create_expanded_button('Stats', BUTTON_STYLE)
# Bottom row: one control under each right-hand column.
grid[3, 1] = create_expanded_button('Drop Down 1 - Day', BUTTON_STYLE)
grid[3, 2] = create_expanded_button('Slider', BUTTON_STYLE)

grid

GridspecLayout(children=(Button(button_style='info', description='Word Cloud', layout=Layout(grid_area='widget…

## NLP???
---

Allegedly [TextBlob](https://textblob.readthedocs.io/en/dev/index.html) is a decent beginner-friendly natural language processing (NLP) library. I keep seeing NLP tools pop up any time I search for information related to text analysis.