-
Notifications
You must be signed in to change notification settings - Fork 0
/
racingBarDataframe.py
69 lines (54 loc) · 2 KB
/
racingBarDataframe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from collections import Counter
import pandas as pd
from core.sentiment_analysis import create_cleaned
import spacy
from spacy_langdetect import LanguageDetector
# Module-level NLP setup, executed once at import time.
# Load spaCy's small English pipeline.
nlp = spacy.load("en_core_web_sm")
# Append the language-detector component as the last pipeline stage.
# NOTE(review): passing a component instance together with name= looks like the
# spaCy v2 add_pipe signature; confirm the pinned spaCy version supports it.
nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)
# The pipeline's default stop-word set (not referenced by the code shown in this file).
stop_words = nlp.Defaults.stop_words
pd.options.mode.chained_assignment = None # default='warn'; silences pandas' chained-assignment warning, which the author treats as a false positive when reassigning a DataFrame column
def getPopularWords(df, amountNeeded):
    """Return the most frequent whitespace-separated words found in ``df.text``.

    Args:
        df: Object whose ``text`` attribute iterates over messages, e.g. a
            pandas DataFrame with a ``text`` column.
        amountNeeded: Maximum number of ``(word, count)`` pairs to return.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least common.
    """
    wordCounts = Counter()
    for message in df.text:
        # Missing cells read from a CSV show up as NaN (a float), which cannot
        # be split; skip anything that is not text instead of crashing.
        if isinstance(message, str):
            # Counting per message avoids building one huge concatenated string.
            wordCounts.update(message.split())
    return wordCounts.most_common(amountNeeded)
def dfTextToString(df):
    """Collapse the cleaned ``text`` column of ``df`` into one space-separated string.

    The frame is first passed through ``create_cleaned(df, 'text')``; every
    entry of the resulting ``cleaned_words`` column is joined with spaces, and
    those per-row strings are then joined with spaces as well.
    (Assumes each ``cleaned_words`` entry is an iterable of strings -- confirm
    against ``create_cleaned``.)
    """
    cleanedDF = create_cleaned(df, 'text')
    perRow = (' '.join(tokens) for tokens in cleanedDF['cleaned_words'].tolist())
    return ' '.join(perRow)
def _monthEndGrouper():
    """Return a month-end ``pd.Grouper`` that works on old and new pandas."""
    try:
        # pandas >= 2.2 spells the month-end alias 'ME'; plain 'M' is deprecated there.
        return pd.Grouper(freq='ME')
    except ValueError:
        # Older pandas rejects 'ME' as an invalid frequency and only knows 'M'.
        return pd.Grouper(freq='M')


def groupDFByMonth(df):
    """Group the rows of ``df`` by the calendar month of ``created_at``.

    Rows whose ``created_at`` is missing are dropped, the remaining values are
    parsed with ``pd.to_datetime`` and become the index, and the frame is then
    grouped into month-end bins. The caller's DataFrame is left untouched.

    Args:
        df: DataFrame with a ``created_at`` column.

    Returns:
        The pandas GroupBy object; iterating it yields ``(month, sub_frame)``.
    """
    withDates = df[df["created_at"].notna()]
    # assign() builds a new frame, so nothing is written into a filtered slice
    # and the result does not depend on chained-assignment warnings being off.
    indexed = withDates.assign(
        created_at=pd.to_datetime(withDates["created_at"])
    ).set_index("created_at")
    return indexed.groupby(_monthEndGrouper())


def splitByMonth(df):
    """Split ``df`` into a list of DataFrames, one per monthly group.

    Uses the same filtering, parsing and grouping as ``groupDFByMonth`` and
    returns only the sub-frames, in the order the groupby yields them.

    Args:
        df: DataFrame with a ``created_at`` column.

    Returns:
        A list of DataFrames indexed by ``created_at``.
    """
    return [monthFrame for _, monthFrame in groupDFByMonth(df)]
# ---- Script body: build the month-by-word table behind the racing bar chart ----
COVIDdf = pd.read_csv('processed_data/final_csv.txt')
grouped = groupDFByMonth(COVIDdf)

months = []  # one label per monthly group, kept apart from the word counts
table = []   # one {word: count} dict per monthly group
for month, monthDF in grouped:
    monthDF = create_cleaned(monthDF, 'text')
    # NOTE(review): getPopularWords counts the 'text' column, not the
    # 'cleaned_words' column -- confirm create_cleaned also rewrites 'text'.
    popWords = getPopularWords(monthDF, 30)
    months.append(month)
    # The month is stored in the index rather than under a 'month' key, so a
    # popular word that is literally "month" cannot overwrite it.
    table.append(dict(popWords))

# set_index() returns a new frame and the original call discarded it; building
# the frame with the month index directly gives the layout that call intended
# and keeps a spurious integer index out of both CSV files.
Outdf = pd.DataFrame(table, index=pd.Index(months, name='month'))
Outdf.to_csv('test.csv')

# Racing-bar layout: one row per word, one column per month.
df = Outdf.transpose()
print(df)
df.to_csv('racingDF.csv')