# Random sampling

purpose: random sampling the word tokens from the corpora, so we have the equal sampling size \
Looping through different sample sizes: \
from 100 to 2000, with a step = 100 \
so, we can have varying sample sizes for different conditions/groups

## with interjections

from 100 to 2000, with a step = 100 
- OVERALL_TXT = "clean_parent_all.txt" (total words: 7296)
- CV_TXT = "CV_clean_overall_condition.txt" (total words: 3992)
- DG_TXT = "DG_clean_overall_condition.txt" (total words: 3279)
- CONTI_TXT = "contingency_clean_overall_contingency.txt" (total words: 2424)
- NONCON_TXT = "non-contingency_clean_overall_contingency.txt"  (total words: 4845) 


from 100 to 900, with a step = 100 
- CV_C_TXT = "CV-1_clean_overall_contigent" (total words: 2558)
- CV_NC_TXT = "CV-0_clean_overall_Ncontigent" (total words: 1434) 
- DG_C_TXT = "DG-1_clean_overall_contigent" (total words: 2289)
- DG_NC_TXT = "DG-0_clean_overall_Ncontigent" (total words: 990)

## without interjections & sound-like words

- OVERALL_TXT = "clean_pooling_all_clean_woint.txt" (total words: 6816)
- CV_TXT = "CV_pooling_clean_woint.txt" (total words: 3725)
- DG_TXT = "DG_pooling_clean_woint.txt" (total words: 3065)
- CONTI_TXT = "1_C_pooling_clean_woint.txt" (total words: 2245)
- NONCON_TXT = "0_NC_pooling_clean_woint.txt"  (total words: 4541) 


In [1]:
import random
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

In [5]:
# This chunck is for with interjections
# FOLDER_PATH = "C:/Users/USER/PycharmProjects/UniqueWordCalculator/clean_data"
# OVERALL_TXT = "clean_all_text.txt"
# CV_TXT = "CV_clean_pooling.txt"
# DG_TXT = "DG_clean_pooling.txt"
# CONTI_TXT = "1_C_clean_pooling.txt"
# NONCON_TXT = "0_NC_clean_pooling.txt"
# CV_C_TXT = "CV-1_clean_overall_contigent.txt" 
# CV_NC_TXT = "CV-0_clean_overall_Ncontigent.txt" 
# DG_C_TXT = "DG-1_clean_overall_contigent.txt" 
# DG_NC_TXT = "DG-0_clean_overall_Ncontigent.txt" 
# OUTPUT_PATH = "C:/Users/USER/PycharmProjects/UniqueWordCalculator/output/inc_interjections"

In [2]:
# this chunk is for without interjections
FOLDER_PATH = "C:/Users/USER/PycharmProjects/UniqueWordCalculator/parent-condition/pooling-data/wo_interjections"
OVERALL_TXT = "clean_pooling_all_clean_woint.txt"

# condition
CV_TXT = "CV_pooling_clean_woint.txt"
DG_TXT = "DG_pooling_clean_woint.txt"

# speech type
CONTI_TXT = "1_C_pooling_clean_woint.txt"
NONCON_TXT = "0_NC_pooling_clean_woint.txt"

# condition x speech type
CV_C_TXT = "CV-1_pooling_clean_woint.txt" 
CV_NC_TXT = "CV-0_pooling_clean_woint.txt" 
DG_C_TXT = "DG-1_pooling_clean_woint.txt" 
DG_NC_TXT = "DG-0_pooling_clean_woint.txt" 

OUTPUT_PATH = "C:/Users/USER/PycharmProjects/UniqueWordCalculator/output/wo_interjections"

In [112]:
CURRENT_CON = NONCON_TXT

In [113]:
FILENAME = FOLDER_PATH + "/" + CURRENT_CON
FILENAME

'C:/Users/USER/PycharmProjects/UniqueWordCalculator/parent-condition/pooling-data/wo_interjections/0_NC_pooling_clean_woint.txt'

In [114]:
with open(FILENAME) as all_file:
    all_content = all_file.readlines()
#     all_content = [token.strip('\n') for token in all_content]


# iteration for  3 times

1. loop from starting size to end size with steps
2. repeat this method for 3 times

In [115]:
STARTING_SIZE = 2000
ENDING_SIZE = 2000
STEP = 1
ITER_COUNT = 3

In [116]:
# Store results in DataFrame
columns = ['sample_size', 'iteration', 'word', 'rank', 'frequency']
results = pd.DataFrame(columns=columns)

In [117]:
# Iterating for 50 times
for iter_num in range(1, ITER_COUNT + 1):
    # Randomly sampling from 100 to 2000, with step 100
    for sample_size in range(STARTING_SIZE, ENDING_SIZE + STEP, STEP):
        sampled_words = random.sample(all_content, sample_size)
        
        # Count the frequency of words
        word_freq = Counter(sampled_words)
        
        # Sort by frequency and assign rank
        ranked_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
        
        # Append results to DataFrame
        for rank, (word, freq) in enumerate(ranked_words, start=1):
            results = results.append({'sample_size': sample_size, 'iteration': iter_num,
                                       'word': word, 'rank': rank, 'frequency': freq}, ignore_index=True)

In [118]:
print(results)

outputfile = f'output_frequrank_{CURRENT_CON}_it03.csv'
results.to_csv(outputfile, index=False)

    sample_size iteration        word rank frequency
0          2000         1        be\n    1       268
1          2000         1  peekaboo\n    2       183
2          2000         1     where\n    3       180
3          2000         1       you\n    4       112
4          2000         1       boo\n    5       103
..          ...       ...         ...  ...       ...
499        2000         3      need\n  163         1
500        2000         3     watch\n  164         1
501        2000         3      sing\n  165         1
502        2000         3      sigh\n  166         1
503        2000         3     child\n  167         1

[504 rows x 5 columns]


# Speech type x condition

1. starting from 450 to 900
2. repeat 4 times

In [135]:
STARTING_SIZE = 450
ENDING_SIZE = 900
STEP = 450
ITER_COUNT = 4

In [136]:
# Store results in DataFrame
columns = ['sample_size', 'iteration', 'word', 'rank', 'frequency']
results = pd.DataFrame(columns=columns)

In [137]:
# Iterating for 4 times
for iter_num in range(1, ITER_COUNT + 1):
    # Randomly sampling from 100 to 2000, with step 100
    for sample_size in range(STARTING_SIZE, ENDING_SIZE + STEP, STEP):
        sampled_words = random.sample(all_content, sample_size)
        
        # Count the frequency of words
        word_freq = Counter(sampled_words)
        
        # Sort by frequency and assign rank
        ranked_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
        
        # Append results to DataFrame
        for rank, (word, freq) in enumerate(ranked_words, start=1):
            results = results.append({'sample_size': sample_size, 'iteration': iter_num,
                                       'word': word, 'rank': rank, 'frequency': freq}, ignore_index=True)

In [138]:
print(results)

outputfile = f'output_frequrank_{CURRENT_CON}.csv'
results.to_csv(outputfile, index=False)

    sample_size iteration        word rank frequency
0           450         1        be\n    1        61
1           450         1  peekaboo\n    2        35
2           450         1     where\n    3        34
3           450         1       boo\n    4        32
4           450         1       you\n    5        27
..          ...       ...         ...  ...       ...
759         900         4     leave\n  106         1
760         900         4      baby\n  107         1
761         900         4    little\n  108         1
762         900         4      know\n  109         1
763         900         4       boy\n  110         1

[764 rows x 5 columns]
