In [None]:
import pandas as pd
import numpy as np

# Pandas 사용

## input.small Data

In [None]:
# input.small 데이터 불러오기
df = pd.read_csv("/content/sample_data/input.small", sep='\t')

print(df.shape)
df[:3]

(49, 1)


Unnamed: 0,"174690 Times online Chirac stated that ""The Republic is not a dictatorship of rumours, a dictatorship of calumny."""
0,211774 The English Suites closely follow the t...
1,60669 A lens must be used in order to form a c...
2,"209325 David B. Miller, ""The Velikie Minei Che..."


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 1 columns):
 #   Column                                                                                                              Non-Null Count  Dtype 
---  ------                                                                                                              --------------  ----- 
 0   174690 Times online Chirac stated that "The Republic is not a dictatorship of rumours, a dictatorship of calumny."  49 non-null     object
dtypes: object(1)
memory usage: 520.0+ bytes


In [None]:
# 칼럼명 변경
df.rename(columns={'174690 Times online Chirac stated that "The Republic is not a dictatorship of rumours, a dictatorship of calumny."' : 'sentences'}, inplace=True)
df[:1]

Unnamed: 0,sentences
0,211774 The English Suites closely follow the t...


In [None]:
# 칼럼명에 있던 문장을 새로운 행으로 추가
df.loc[49] = ['174690 Times online Chirac stated that "The Republic is not a dictatorship of rumours, a dictatorship of calumny."']
df

Unnamed: 0,sentences
0,211774 The English Suites closely follow the t...
1,60669 A lens must be used in order to form a c...
2,"209325 David B. Miller, ""The Velikie Minei Che..."
3,245837 According to BYU professor Marvin S. Hi...
4,25190 Access to higher education was an early ...
5,175832 The Borg opened fire on the Saratoga le...
6,128347 Seth decides to become human through th...
7,"173974 Lastly, on 5 April, she sent in to Plym..."
8,169956 This technique can be used to effective...
9,130579 The Sun Folk Compared to the Wolfriders...


In [None]:
# 모든 문장을 소문자로 변경
df['sentences'] = df['sentences'].str.lower()
df[:1]

Unnamed: 0,sentences
0,211774 the english suites closely follow the t...


In [None]:
# 각 문장 내의 특수문자를 띄어쓰기로 변경
import re
df['sentences'] = df['sentences'].str.replace("[^a-zA-Z0-9]", ' ')
df[:3]

  df['sentences'] = df['sentences'].str.replace("[^a-zA-Z0-9]", ' ')


Unnamed: 0,sentences
0,211774 the english suites closely follow the t...
1,60669 a lens must be used in order to form a c...
2,209325 david b miller the velikie minei che...


In [None]:
num = []
sen = []

for i in range(len(df)):
  a = df['sentences'][i][:6]
  b = df['sentences'][i][6:]
  num.append(a)
  sen.append(b)

df['#'] = num

df.drop(columns='sentences', inplace=True)
df['sentences'] = sen

df[:3]

Unnamed: 0,#,sentences
0,211774,the english suites closely follow the traditi...
1,60669,a lens must be used in order to form a collima...
2,209325,david b miller the velikie minei chetii an...


In [None]:
# 각 문장을 토큰화
df['tokens'] = df['sentences'].str.split()
df[:3]

Unnamed: 0,#,sentences,tokens
0,211774,the english suites closely follow the traditi...,"[the, english, suites, closely, follow, the, t..."
1,60669,a lens must be used in order to form a collima...,"[a, lens, must, be, used, in, order, to, form,..."
2,209325,david b miller the velikie minei chetii an...,"[david, b, miller, the, velikie, minei, chetii..."


In [None]:
freq = pd.DataFrame([], columns=['#', 'words', 'frequency'])
freq[:1]

Unnamed: 0,#,words,frequency


In [None]:
t = pd.DataFrame(df['tokens'].iloc[0]).value_counts()
pd.DataFrame(t, columns=['frequency']).reset_index()
#.sort_values(by='frequency', ascending=False)

Unnamed: 0,0,frequency
0,the,5
1,a,2
2,and,2
3,including,1
4,suites,1
5,single,1
6,sarabande,1
7,prelude,1
8,movement,1
9,model,1


In [None]:
# 몇번 문장에서 특정 단어가 몇번 나오는지 count 하는 데이터프레임 생성
for i in range(len(df)):
  a = pd.DataFrame(df['tokens'].iloc[i]).value_counts()
  b = pd.DataFrame(a, columns=['frequency']).reset_index()
  b.columns = ['words', 'frequency']
  b['#'] = df['#'].iloc[i]
  freq = pd.concat([freq, b], axis=0)

# '#' 칼럼으로 정렬, frequency 로 내림차순
freq.sort_values(by=['#', 'words', 'frequency'], ascending=[True, True, False])

print(freq.shape)
freq[:5]

(982, 3)


Unnamed: 0,#,words,frequency
0,211774,the,5
1,211774,a,2
2,211774,and,2
3,211774,including,1
4,211774,suites,1


In [None]:
# 단어 입력시 해당 단어가 속한 문장번호와 문장 내에서 단어의 빈도수를 출력
search = input('단어를 입력하세요: ')
freq[freq['words'] == search]

단어를 입력하세요: is


Unnamed: 0,#,words,frequency
8,235303,is,1
21,93337,is,1
11,184878,is,1
12,57315,is,1
16,231722,is,1
5,174690,is,1


## input.big Data

In [None]:
# input.big 데이터 불러오기
df = pd.read_csv("/content/sample_data/input.big", sep='\t')

print(df.shape)
df[:3]

(279628, 1)


Unnamed: 0,"174690 Times online Chirac stated that ""The Republic is not a dictatorship of rumours, a dictatorship of calumny."""
0,211774 The English Suites closely follow the t...
1,60669 A lens must be used in order to form a c...
2,"209325 David B. Miller, ""The Velikie Minei Che..."


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279628 entries, 0 to 279627
Data columns (total 1 columns):
 #   Column                                                                                                              Non-Null Count   Dtype 
---  ------                                                                                                              --------------   ----- 
 0   174690 Times online Chirac stated that "The Republic is not a dictatorship of rumours, a dictatorship of calumny."  279628 non-null  object
dtypes: object(1)
memory usage: 2.1+ MB


In [None]:
# 칼럼명 변경
df.rename(columns={'174690 Times online Chirac stated that "The Republic is not a dictatorship of rumours, a dictatorship of calumny."' : 'sentences'}, inplace=True)
df[:1]

Unnamed: 0,sentences
0,211774 The English Suites closely follow the t...


In [None]:
# 칼럼명에 있던 문장을 새로운 행으로 추가
df.loc[279628] = ['174690 Times online Chirac stated that "The Republic is not a dictatorship of rumours, a dictatorship of calumny."']
df

Unnamed: 0,sentences
0,211774 The English Suites closely follow the t...
1,60669 A lens must be used in order to form a c...
2,"209325 David B. Miller, ""The Velikie Minei Che..."
3,245837 According to BYU professor Marvin S. Hi...
4,25190 Access to higher education was an early ...
...,...
279624,134456 The plant faced criticism in the past d...
279625,"190067 Sideshow: Kissinger, Nixon and the Dest..."
279626,89422 This is probably from the influence of a...
279627,207486 Readings on the exceptional child.


In [None]:
# 모든 문장을 소문자로 변경
df['sentences'] = df['sentences'].str.lower()
df[:1]

Unnamed: 0,sentences
0,211774 the english suites closely follow the t...


In [None]:
# 각 문장 내의 특수문자를 띄어쓰기로 변경
import re
df['sentences'] = df['sentences'].str.replace("[^a-zA-Z0-9]", ' ')
df[:3]

  df['sentences'] = df['sentences'].str.replace("[^a-zA-Z0-9]", ' ')


Unnamed: 0,sentences
0,211774 the english suites closely follow the t...
1,60669 a lens must be used in order to form a c...
2,209325 david b miller the velikie minei che...


In [None]:
num = []
sen = []

for i in range(len(df)):
  a = df['sentences'][i][:6]
  b = df['sentences'][i][6:]
  num.append(a)
  sen.append(b)

df['#'] = num

df.drop(columns='sentences', inplace=True)
df['sentences'] = sen

df[:3]

Unnamed: 0,#,sentences
0,211774,the english suites closely follow the traditi...
1,60669,a lens must be used in order to form a collima...
2,209325,david b miller the velikie minei chetii an...


In [None]:
# 각 문장을 토큰화
df['tokens'] = df['sentences'].str.split()
df[:3]

Unnamed: 0,#,sentences,tokens
0,211774,the english suites closely follow the traditi...,"[the, english, suites, closely, follow, the, t..."
1,60669,a lens must be used in order to form a collima...,"[a, lens, must, be, used, in, order, to, form,..."
2,209325,david b miller the velikie minei chetii an...,"[david, b, miller, the, velikie, minei, chetii..."


In [None]:
freq = pd.DataFrame([], columns=['#', 'words', 'frequency'])
freq[:1]

Unnamed: 0,#,words,frequency


In [None]:
# 몇번 문장에서 특정 단어가 몇번 나오는지 count 하는 데이터프레임 생성
for i in range(len(df)):
  a = pd.DataFrame(df['tokens'].iloc[i]).value_counts()
  b = pd.DataFrame(a, columns=['frequency']).reset_index()
  b.columns = ['words', 'frequency']
  b['#'] = df['#'].iloc[i]
  freq = pd.concat([freq, b], axis=0)

# '#' 칼럼으로 정렬, frequency 로 내림차순
freq.sort_values(by=['#', 'words', 'frequency'], ascending=[True, True, False])

print(freq.shape)
freq[:5]

(5027705, 3)


Unnamed: 0,#,words,frequency
0,211774,the,5
1,211774,a,2
2,211774,and,2
3,211774,including,1
4,211774,suites,1


In [None]:
# 결과물 csv 파일로 저장
freq.to_csv('/content/drive/MyDrive/3차 프로젝트 (230125 ~ 230222)/1주차 과제/input_big_frequency.csv')

In [None]:
# 단어 입력시 해당 단어가 속한 문장번호와 문장 내에서 단어의 빈도수를 출력
search = input('단어를 입력하세요: ')
freq[freq['words'] == search]

단어를 입력하세요: is


Unnamed: 0,#,words,frequency
8,235303,is,1
21,93337,is,1
11,184878,is,1
12,57315,is,1
16,231722,is,1
...,...,...,...
14,246741,is,1
0,247627,is,2
20,187510,is,1
6,89422,is,1


# Numpy 사용

## input.small Data

In [2]:
import pandas as pd
import numpy as np

In [4]:
# input.small 데이터 불러오기
df = pd.read_csv("/content/sample_data/input.small", sep='\t')

print(df.shape)
df[:3]

(49, 1)


Unnamed: 0,"174690 Times online Chirac stated that ""The Republic is not a dictatorship of rumours, a dictatorship of calumny."""
0,211774 The English Suites closely follow the t...
1,60669 A lens must be used in order to form a c...
2,"209325 David B. Miller, ""The Velikie Minei Che..."


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 1 columns):
 #   Column                                                                                                              Non-Null Count  Dtype 
---  ------                                                                                                              --------------  ----- 
 0   174690 Times online Chirac stated that "The Republic is not a dictatorship of rumours, a dictatorship of calumny."  49 non-null     object
dtypes: object(1)
memory usage: 520.0+ bytes


In [6]:
# 칼럼명 변경
df.rename(columns={'174690 Times online Chirac stated that "The Republic is not a dictatorship of rumours, a dictatorship of calumny."' : 'sentences'}, inplace=True)

# 칼럼명에 있던 문장을 새로운 행으로 추가
df.loc[49] = ['174690 Times online Chirac stated that "The Republic is not a dictatorship of rumours, a dictatorship of calumny."']

df

Unnamed: 0,sentences
0,211774 The English Suites closely follow the t...
1,60669 A lens must be used in order to form a c...
2,"209325 David B. Miller, ""The Velikie Minei Che..."
3,245837 According to BYU professor Marvin S. Hi...
4,25190 Access to higher education was an early ...
5,175832 The Borg opened fire on the Saratoga le...
6,128347 Seth decides to become human through th...
7,"173974 Lastly, on 5 April, she sent in to Plym..."
8,169956 This technique can be used to effective...
9,130579 The Sun Folk Compared to the Wolfriders...


In [7]:
# 모든 문장을 소문자로 변경
df['sentences'] = df['sentences'].str.lower()

# 각 문장 내의 특수문자를 띄어쓰기로 변경
import re
df['sentences'] = df['sentences'].str.replace("[^a-z0-9]", ' ')
df[:3]

  df['sentences'] = df['sentences'].str.replace("[^a-z0-9]", ' ')


Unnamed: 0,sentences
0,211774 the english suites closely follow the t...
1,60669 a lens must be used in order to form a c...
2,209325 david b miller the velikie minei che...


In [8]:
# 띄어쓰기 단위로 분리
df['sentences'] = df['sentences'].str.split()
df[:1]

Unnamed: 0,sentences
0,"[211774, the, english, suites, closely, follow..."


In [9]:
a = pd.DataFrame(df['sentences'][0][1:]).value_counts()
pd.DataFrame(a, columns=['fre']).reset_index()

Unnamed: 0,0,fre
0,the,5
1,a,2
2,and,2
3,including,1
4,suites,1
5,single,1
6,sarabande,1
7,prelude,1
8,movement,1
9,model,1


In [10]:
df['sentences'].iloc[0][0]

'211774'

In [11]:
small_df = pd.DataFrame([], columns=['id', 'words', 'frequency'])

for i in range(len(df)):
  a = pd.DataFrame(df['sentences'][i][1:]).value_counts()
  b = pd.DataFrame(a, columns=['fre']).reset_index()
  b.columns = ['words', 'frequency']
  b['id'] = df['sentences'].iloc[i][0]

  small_df = pd.concat([small_df, b], axis=0)

small_df

Unnamed: 0,id,words,frequency
0,211774,the,5
1,211774,a,2
2,211774,and,2
3,211774,including,1
4,211774,suites,1
...,...,...,...
9,174690,rumours,1
10,174690,stated,1
11,174690,that,1
12,174690,the,1


In [12]:
# 결과물 csv 파일로 저장
small_df.to_csv('/content/drive/MyDrive/3차 프로젝트 (230125 ~ 230222)/1주차 과제/input_small_frequency.csv')

## 딕셔너리 사용

In [9]:
df[:3]

Unnamed: 0,sentences
0,"[211774, the, english, suites, closely, follow..."
1,"[60669, a, lens, must, be, used, in, order, to..."
2,"[209325, david, b, miller, the, velikie, minei..."


In [18]:
#unique, counts =
np.unique(df['sentences'][0], return_counts=True)

(array(['211774', 'a', 'adding', 'allemande', 'and', 'before', 'between',
        'closely', 'english', 'follow', 'gigue', 'including', 'model',
        'movement', 'prelude', 'sarabande', 'single', 'suites', 'the',
        'traditional'], dtype='<U11'),
 array([1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1]))

In [39]:
freq_dict = {}

for i in range(10):
  unique, counts = np.unique(df['sentences'][i][1:], return_counts=True)
  id = int(df['sentences'][i][0])
  #a = {unique : dict(zip(id, counts))}
  a = {unique : dict(zip(counts, id))}
  freq_dict.update(a)

freq_dict
#sorted(freq_dict.items(), reverse=True)[:3]

TypeError: ignored

In [40]:
# 몇번 문장에서 특정 단어가 몇번 나오는지 count 하는 딕셔너리 생성
freq_dict = {}

for i in range(len(df)):
  unique, counts = np.unique(df['sentences'][i][1:], return_counts=True)
  a = {int(df['sentences'][i][0]) : dict(zip(unique, counts))}
  freq_dict.update(a)

freq_dict
#sorted(freq_dict.items(), reverse=True)[:3]

{211774: {'a': 2,
  'adding': 1,
  'allemande': 1,
  'and': 2,
  'before': 1,
  'between': 1,
  'closely': 1,
  'english': 1,
  'follow': 1,
  'gigue': 1,
  'including': 1,
  'model': 1,
  'movement': 1,
  'prelude': 1,
  'sarabande': 1,
  'single': 1,
  'suites': 1,
  'the': 5,
  'traditional': 1},
 60669: {'a': 3,
  'be': 1,
  'beam': 1,
  'by': 1,
  'collimated': 1,
  'form': 1,
  'in': 1,
  'laser': 1,
  'lens': 1,
  'like': 1,
  'must': 1,
  'order': 1,
  'pointer': 1,
  'produced': 1,
  'that': 1,
  'to': 1,
  'used': 1},
 209325: {'and': 2,
  'b': 1,
  'chetii': 1,
  'consciousness': 1,
  'david': 1,
  'kniga': 1,
  'makarii': 1,
  'metropolitan': 1,
  'miller': 1,
  'minei': 1,
  'national': 1,
  'of': 2,
  'origins': 1,
  'russian': 1,
  'stepennaia': 1,
  'the': 3,
  'velikie': 1},
 245837: {'a': 1,
  'according': 1,
  'book': 1,
  'byu': 1,
  'claims': 1,
  'derived': 1,
  'entirely': 1,
  'from': 1,
  'hebrews': 1,
  'hill': 1,
  'indians': 1,
  'maintained': 1,
  'marvin':

In [None]:
# 문서 번호로 정렬
sorted(freq_dict.items())

In [None]:
# 문서 번호 수로 정렬
# 정렬 방법 (생각) : keys(문서번호), values(단어별 count) 따로 분리한다음
# values(단어별 count) 를 빈도수 순으로 정렬하고, 이를 keys 랑 다시 합쳐서 dict 으로 만든 다음
# 그 dict 을 keys 순으로 정렬한다

In [56]:
keys, values = zip(*freq_dict.items())
sorted(keys)[:3]
#keys[0] = int(keys[0])
#keys[0]

[7454, 8701, 22399]

In [None]:
#freq_dict
"""
8701: {'2009': 1,
  'confusion': 1,
  'guide': 1,
  'hashin': 1,
  'mixed': 1,
  'tigers': 1,
  'up': 1},
"""

In [61]:
freq_dict[8701]

{'2009': 1,
 'confusion': 1,
 'guide': 1,
 'hashin': 1,
 'mixed': 1,
 'tigers': 1,
 'up': 1}

In [66]:
# 정렬된 최종 딕셔너리
keys, values = zip(*freq_dict.items())
sorted_freq_dict = {}

for key in sorted(keys):
  v = dict(sorted(freq_dict[key].items(), key = lambda item: item[1], reverse = True))
  a = {key : v}
  sorted_freq_dict.update(a)

sorted_freq_dict

{7454: {'before': 1,
  'days': 1,
  'first': 1,
  'for': 1,
  'his': 1,
  'lookout': 1,
  'spotted': 1,
  'tasmania': 1,
  'the': 1,
  'this': 1,
  'time': 1,
  'two': 1,
  'was': 1},
 8701: {'2009': 1,
  'confusion': 1,
  'guide': 1,
  'hashin': 1,
  'mixed': 1,
  'tigers': 1,
  'up': 1},
 22399: {'actually': 1,
  'benign': 1,
  'despite': 1,
  'every': 1,
  'exhibits': 1,
  'eye': 1,
  'five': 1,
  'in': 1,
  'its': 1,
  'mutation': 1,
  'one': 1,
  'only': 1,
  'prominence': 1,
  'third': 1,
  'this': 1,
  'transgolians': 1},
 25190: {'access': 2,
  'and': 2,
  'on': 2,
  'the': 2,
  'to': 2,
  'an': 1,
  'articles': 1,
  'completing': 1,
  'concern': 1,
  'disabled': 1,
  'early': 1,
  'education': 1,
  'educations': 1,
  'from': 1,
  'had': 1,
  'higher': 1,
  'invited': 1,
  'knew': 1,
  'laurie': 1,
  'negotiated': 1,
  'people': 1,
  'seriously': 1,
  'she': 1,
  'subject': 1,
  'successfully': 1,
  'their': 1,
  'was': 1,
  'way': 1,
  'well': 1,
  'were': 1,
  'whom': 1},
 50

In [90]:
print(sorted_freq_dict[8701].items())
print()
print(sorted_freq_dict[8701].keys())
print()
print(sorted_freq_dict[8701].values())

dict_items([('2009', 1), ('confusion', 1), ('guide', 1), ('hashin', 1), ('mixed', 1), ('tigers', 1), ('up', 1)])

dict_keys(['2009', 'confusion', 'guide', 'hashin', 'mixed', 'tigers', 'up'])

dict_values([1, 1, 1, 1, 1, 1, 1])


In [97]:
print(list(sorted_freq_dict[8701].items())[0])
print(list(sorted_freq_dict[8701].items())[0][0])
#print(list(sorted_freq_dict[8701].items())['2009'][1])

('2009', 1)
2009


In [85]:
list(sorted_freq_dict.keys())[:3]

[7454, 8701, 22399]

In [113]:
sorted_freq_dict.get(50966).get('a')

1

In [127]:
# 입력한 단어에 대한 정보 출력
search = input('단어를 입력하세요: ')

keys_lst = list(sorted_freq_dict.keys())  # 문서 번호 리스트
search_df = pd.DataFrame([], columns=['words', 'id', 'frequency'])  # 결과물을 받을 빈 데이터 프레임

cnt = 0
for key in keys_lst:
  if search in sorted_freq_dict[key].keys():
    fre = sorted_freq_dict.get(key).get(search)
    search_df.loc[cnt] = {'words' : search, 'id' : key, 'frequency' : fre}
    cnt =+ 1
        #[search, key, sorted_freq_dict.get(key).get(search)], columns=['words', 'id', 'frequency'])
  else:
    pass

search_df

단어를 입력하세요: a


Unnamed: 0,words,id,frequency
0,a,50966,1
1,a,277606,3


In [None]:
# 결과물 csv 파일로 저장
#freq.to_csv('/content/drive/MyDrive/3차 프로젝트 (230125 ~ 230222)/1주차 과제/input_big_frequency.csv')

## input.big Data

In [None]:
import pandas as pd
import numpy as np

In [128]:
# input.big 데이터 불러오기
df = pd.read_csv("/content/sample_data/input.big", sep='\t')

print(df.shape)
df[:3]

(279628, 1)


Unnamed: 0,"174690 Times online Chirac stated that ""The Republic is not a dictatorship of rumours, a dictatorship of calumny."""
0,211774 The English Suites closely follow the t...
1,60669 A lens must be used in order to form a c...
2,"209325 David B. Miller, ""The Velikie Minei Che..."


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 1 columns):
 #   Column                                                                                                              Non-Null Count  Dtype 
---  ------                                                                                                              --------------  ----- 
 0   174690 Times online Chirac stated that "The Republic is not a dictatorship of rumours, a dictatorship of calumny."  49 non-null     object
dtypes: object(1)
memory usage: 520.0+ bytes


In [129]:
# 칼럼명 변경
df.rename(columns={'174690 Times online Chirac stated that "The Republic is not a dictatorship of rumours, a dictatorship of calumny."' : 'sentences'}, inplace=True)

# 칼럼명에 있던 문장을 새로운 행으로 추가
df.loc[279628] = ['174690 Times online Chirac stated that "The Republic is not a dictatorship of rumours, a dictatorship of calumny."']

df

Unnamed: 0,sentences
0,211774 The English Suites closely follow the t...
1,60669 A lens must be used in order to form a c...
2,"209325 David B. Miller, ""The Velikie Minei Che..."
3,245837 According to BYU professor Marvin S. Hi...
4,25190 Access to higher education was an early ...
...,...
279624,134456 The plant faced criticism in the past d...
279625,"190067 Sideshow: Kissinger, Nixon and the Dest..."
279626,89422 This is probably from the influence of a...
279627,207486 Readings on the exceptional child.


In [130]:
# 모든 문장을 소문자로 변경
df['sentences'] = df['sentences'].str.lower()

# 각 문장 내의 특수문자를 띄어쓰기로 변경
import re
df['sentences'] = df['sentences'].str.replace("[^a-z0-9]", ' ')
df[:3]

  df['sentences'] = df['sentences'].str.replace("[^a-z0-9]", ' ')


Unnamed: 0,sentences
0,211774 the english suites closely follow the t...
1,60669 a lens must be used in order to form a c...
2,209325 david b miller the velikie minei che...


In [131]:
# 띄어쓰기 단위로 분리
df['sentences'] = df['sentences'].str.split()
df[:1]

Unnamed: 0,sentences
0,"[211774, the, english, suites, closely, follow..."


## 딕셔너리 사용

In [132]:
# 몇번 문장에서 특정 단어가 몇번 나오는지 count 하는 딕셔너리 생성
freq_dict = {}

for i in range(len(df)):
  unique, counts = np.unique(df['sentences'][i][1:], return_counts=True)
  a = {int(df['sentences'][i][0]) : dict(zip(unique, counts))}
  freq_dict.update(a)

freq_dict
#sorted(freq_dict.items(), reverse=True)[:3]

{211774: {'a': 2,
  'adding': 1,
  'allemande': 1,
  'and': 2,
  'before': 1,
  'between': 1,
  'closely': 1,
  'english': 1,
  'follow': 1,
  'gigue': 1,
  'including': 1,
  'model': 1,
  'movement': 1,
  'prelude': 1,
  'sarabande': 1,
  'single': 1,
  'suites': 1,
  'the': 5,
  'traditional': 1},
 60669: {'a': 3,
  'be': 1,
  'beam': 1,
  'by': 1,
  'collimated': 1,
  'form': 1,
  'in': 1,
  'laser': 1,
  'lens': 1,
  'like': 1,
  'must': 1,
  'order': 1,
  'pointer': 1,
  'produced': 1,
  'that': 1,
  'to': 1,
  'used': 1},
 209325: {'and': 2,
  'b': 1,
  'chetii': 1,
  'consciousness': 1,
  'david': 1,
  'kniga': 1,
  'makarii': 1,
  'metropolitan': 1,
  'miller': 1,
  'minei': 1,
  'national': 1,
  'of': 2,
  'origins': 1,
  'russian': 1,
  'stepennaia': 1,
  'the': 3,
  'velikie': 1},
 245837: {'a': 1,
  'according': 1,
  'book': 1,
  'byu': 1,
  'claims': 1,
  'derived': 1,
  'entirely': 1,
  'from': 1,
  'hebrews': 1,
  'hill': 1,
  'indians': 1,
  'maintained': 1,
  'marvin':

In [None]:
# 문서 번호 수로 정렬
# 정렬 방법 (생각) : keys(문서번호), values(단어별 count) 따로 분리한다음
# values(단어별 count) 를 빈도수 순으로 정렬하고, 이를 keys 랑 다시 합쳐서 dict 으로 만든 다음
# 그 dict 을 keys 순으로 정렬한다

In [133]:
keys, values = zip(*freq_dict.items())
sorted(keys)[:3]
#keys[0] = int(keys[0])
#keys[0]

[1, 2, 3]

In [None]:
#freq_dict
"""
8701: {'2009': 1,
  'confusion': 1,
  'guide': 1,
  'hashin': 1,
  'mixed': 1,
  'tigers': 1,
  'up': 1},
"""

In [134]:
freq_dict[8701]

{'2009': 1,
 'confusion': 1,
 'guide': 1,
 'hashin': 1,
 'mixed': 1,
 'tigers': 1,
 'up': 1}

In [135]:
# 정렬된 최종 딕셔너리
keys, values = zip(*freq_dict.items())
sorted_freq_dict = {}

for key in sorted(keys):
  v = dict(sorted(freq_dict[key].items(), key = lambda item: item[1], reverse = True))
  a = {key : v}
  sorted_freq_dict.update(a)

sorted_freq_dict

{1: {'the': 3,
  '1930s': 1,
  '1940s': 1,
  'and': 1,
  'defensive': 1,
  'dilapidated': 1,
  'even': 1,
  'fortifications': 1,
  'had': 1,
  'in': 1,
  'modern': 1,
  'of': 1,
  'showing': 1,
  'still': 1,
  'that': 1,
  'usefulness': 1,
  'warfare': 1},
 2: {'a': 1,
  'and': 1,
  'celtic': 1,
  'cemented': 1,
  'day': 1,
  'finally': 1,
  'had': 1,
  'home': 1,
  'huge': 1,
  'permanent': 1,
  'present': 1,
  'see': 1,
  'success': 1,
  'the': 1,
  'this': 1,
  'to': 1,
  'would': 1},
 3: {'called': 2,
  'is': 2,
  'round': 2,
  'a': 1,
  'areas': 1,
  'hand': 1,
  'in': 1,
  'pakki': 1,
  'pure': 1,
  'the': 1,
  'this': 1,
  'trail': 1,
  'trio': 1,
  'where': 1},
 4: {'were': 2,
  '1000': 1,
  'but': 1,
  'candle': 1,
  'clashes': 1,
  'deployed': 1,
  'in': 1,
  'lit': 1,
  'more': 1,
  'no': 1,
  'officers': 1,
  'police': 1,
  'reported': 1,
  'than': 1,
  'the': 1,
  'vigil': 1},
 5: {'by': 1,
  'catch': 1,
  'edges': 1,
  'follows': 1,
  'it': 1,
  'its': 1,
  'of': 1,
  'of

In [137]:
# 입력한 단어에 대한 정보 출력
search = input('단어를 입력하세요: ')

keys_lst = list(sorted_freq_dict.keys())  # 문서 번호 리스트
search_df = pd.DataFrame([], columns=['words', 'id', 'frequency'])  # 결과물을 받을 빈 데이터 프레임

cnt = 0

for key in keys_lst:
  if search in sorted_freq_dict[key].keys():
    fre = sorted_freq_dict.get(key).get(search)
    search_df.loc[cnt] = {'words' : search, 'id' : key, 'frequency' : fre}
    cnt =+ 1
  else:
    pass

search_df

# 41초 걸림

단어를 입력하세요: is


Unnamed: 0,words,id,frequency
0,is,3,2
1,is,279626,1


In [None]:
# 결과물 csv 파일로 저장
#freq.to_csv('/content/drive/MyDrive/3차 프로젝트 (230125 ~ 230222)/1주차 과제/input_big_frequency.csv')