In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [57]:
data=pd.read_csv('../artifacts/sentiment_analysis.csv')

In [58]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


## Data preprossesing

In [59]:
data.shape

(7920, 3)

In [60]:
data.duplicated().sum()

0

In [61]:
data.isnull().sum()

id       0
label    0
tweet    0
dtype: int64

### text preprossessing

In [62]:
import re
import string

In [63]:
data['tweet'].head(5)

0    #fingerprint #Pregnancy Test https://goo.gl/h1...
1    Finally a transparant silicon case ^^ Thanks t...
2    We love this! Would you go? #talk #makememorie...
3    I'm wired I know I'm George I was made that wa...
4    What amazing service! Apple won't even talk to...
Name: tweet, dtype: object

In [64]:
#convert uppercase to lowercase
data['tweet']= data['tweet'].apply(lambda x:" ".join(x.lower() for x in x.split()))

In [65]:
data['tweet'].head(5)

0    #fingerprint #pregnancy test https://goo.gl/h1...
1    finally a transparant silicon case ^^ thanks t...
2    we love this! would you go? #talk #makememorie...
3    i'm wired i know i'm george i was made that wa...
4    what amazing service! apple won't even talk to...
Name: tweet, dtype: object

In [66]:
#remove links
data['tweet']= data['tweet'].apply(lambda x:" ".join(re.sub(r'(https?://\S+|www\.\S+)',' ',x,flags=re.MULTILINE) for x in x.split()))

In [67]:
data['tweet'].head(5)

0    #fingerprint #pregnancy test   #android #apps ...
1    finally a transparant silicon case ^^ thanks t...
2    we love this! would you go? #talk #makememorie...
3    i'm wired i know i'm george i was made that wa...
4    what amazing service! apple won't even talk to...
Name: tweet, dtype: object

In [68]:
#remove punctuations
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ')
    return text

# Apply to the 'tweet' column
data['tweet'] = data['tweet'].apply(remove_punctuations)

In [69]:
data['tweet'].tail(5)

7915    live out loud  lol  liveoutloud  selfie  smile...
7916    we would like to wish you an amazing day  make...
7917    helping my lovely 90 year old neighbor with he...
7918    finally got my  smart  pocket  wifi stay conne...
7919    apple barcelona     apple  store  bcn  barcelo...
Name: tweet, dtype: object

In [70]:
#remove numbers
data['tweet'] = data['tweet'].str.replace(r'\d+', '', regex=True)

In [71]:
data['tweet'].tail(5)

7915    live out loud  lol  liveoutloud  selfie  smile...
7916    we would like to wish you an amazing day  make...
7917    helping my lovely  year old neighbor with her ...
7918    finally got my  smart  pocket  wifi stay conne...
7919    apple barcelona     apple  store  bcn  barcelo...
Name: tweet, dtype: object

In [76]:
pip install nltk


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/usr/local/bin/python3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [83]:
#remove stopwords(am/is/are/...)
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

stopwords = set(ENGLISH_STOP_WORDS)

def remove_stopwords(text):
    words = text.split()
    cleaned = [w for w in words if w.lower() not in stopwords]
    return " ".join(cleaned)

data['tweet'] = data['tweet'].apply(remove_stopwords)


In [84]:
data['tweet'].head(5)

0    fingerprint pregnancy test android apps beauti...
1    finally transparant silicon case thanks uncle ...
2    love talk makememories unplug relax iphone sma...
3    m wired know m george way iphone cute daventry...
4    amazing service apple won t talk question unle...
Name: tweet, dtype: object

In [88]:
#nltk.download('stopwords',download_dir='/.../static/model/')

In [89]:
#stemming(base word[creating->create])
from nltk.stem import PorterStemmer

# Create a PorterStemmer object
ps = PorterStemmer()

# Apply stemming to the 'tweet' column
data["tweet"] = data["tweet"].apply(lambda x: " ".join(ps.stem(word) for word in x.split()))

In [90]:
data['tweet'].head(5)


0    fingerprint pregnanc test android app beauti c...
1    final transpar silicon case thank uncl yay son...
2    love talk makememori unplug relax iphon smartp...
3     m wire know m georg way iphon cute daventri home
4    amaz servic appl won t talk question unless pa...
Name: tweet, dtype: object

## building Vocabulary

In [95]:
from collections import Counter
vocab = Counter()

In [96]:
vocab

Counter()

In [99]:
data['tweet']

0       fingerprint pregnanc test android app beauti c...
1       final transpar silicon case thank uncl yay son...
2       love talk makememori unplug relax iphon smartp...
3        m wire know m georg way iphon cute daventri home
4       amaz servic appl won t talk question unless pa...
                              ...                        
7915    live loud lol liveoutloud selfi smile soni mus...
7916    like wish amaz day make minut count tl today i...
7917    help love year old neighbor ipad morn just rea...
7918    final got smart pocket wifi stay connect anyti...
7919    appl barcelona appl store bcn barcelona travel...
Name: tweet, Length: 7920, dtype: object

In [100]:
for sentence in data['tweet']:
    print(sentence)

fingerprint pregnanc test android app beauti cute health iger iphoneonli iphonesia iphon
final transpar silicon case thank uncl yay soni xperia s sonyexperias…
love talk makememori unplug relax iphon smartphon wifi connect
m wire know m georg way iphon cute daventri home
amaz servic appl won t talk question unless pay stupid support
iphon softwar updat fuck phone big time stupid iphon
happi instap instadaili soni xperia xperiaz
new type c charger cabl uk … bay amazon etsi new year rob cross tobi young evemun mcmafia taylor spectr newyear start recip technolog samsunggalaxi iphonex pic twitter com pjiwqwtc
bout shop listen music iphon justm music likeforlik followforfollow…
photo fun selfi pool water soni camera picoftheday sun instagood boy cute outdoor
hey appl make new ipod dont make new color inch thinner make crash fuckin minit
ha heavi machineri doe need appl realli drop ball design drinkyourhaterad
contempl give iphon bandwagon simpli cellcom new android depress idontwantto
just 

In [101]:
for sentence in data['tweet']:
    vocab.update(sentence.split())

In [104]:
vocab

Counter({'iphon': 4103,
         'appl': 2909,
         'samsung': 1407,
         'new': 1145,
         'twitter': 1114,
         'com': 1044,
         'phone': 1033,
         's': 901,
         'follow': 890,
         'soni': 821,
         '…': 775,
         't': 609,
         'ipad': 540,
         'rt': 534,
         'pic': 523,
         'love': 503,
         'like': 457,
         'just': 440,
         'day': 435,
         'android': 420,
         'app': 419,
         'life': 416,
         'photo': 394,
         'io': 390,
         'galaxi': 367,
         'instagram': 360,
         'case': 352,
         'cute': 324,
         'beauti': 323,
         'work': 313,
         'gain': 312,
         'today': 311,
         'happi': 296,
         'm': 294,
         'fuck': 294,
         'photographi': 294,
         'game': 285,
         'got': 281,
         'fun': 277,
         'thank': 267,
         'news': 266,
         'music': 265,
         'time': 253,
         'make': 247,
         'upda

In [105]:
len(vocab)

15775

In [106]:
data.shape

(7920, 3)

In [107]:
#so this take 15 775 columns and 7920 rows.So that over fitting. Here features(15 775) < data records(7920)
#so let's take features that has more that 10 times.

tokens = [key for key in vocab if vocab[key]>10]

In [108]:
tokens

['test',
 'android',
 'app',
 'beauti',
 'cute',
 'health',
 'iger',
 'iphoneonli',
 'iphonesia',
 'iphon',
 'final',
 'case',
 'thank',
 'yay',
 'soni',
 'xperia',
 's',
 'love',
 'talk',
 'relax',
 'smartphon',
 'wifi',
 'connect',
 'm',
 'know',
 'way',
 'home',
 'amaz',
 'servic',
 'appl',
 'won',
 't',
 'question',
 'pay',
 'stupid',
 'support',
 'softwar',
 'updat',
 'fuck',
 'phone',
 'big',
 'time',
 'happi',
 'instap',
 'instadaili',
 'xperiaz',
 'new',
 'type',
 'c',
 'charger',
 'cabl',
 'uk',
 '…',
 'amazon',
 'year',
 'newyear',
 'start',
 'technolog',
 'samsunggalaxi',
 'iphonex',
 'pic',
 'twitter',
 'com',
 'shop',
 'listen',
 'music',
 'likeforlik',
 'photo',
 'fun',
 'selfi',
 'water',
 'camera',
 'picoftheday',
 'sun',
 'instagood',
 'boy',
 'outdoor',
 'hey',
 'make',
 'ipod',
 'dont',
 'color',
 'inch',
 'crash',
 'doe',
 'need',
 'realli',
 'drop',
 'ball',
 'design',
 'give',
 'just',
 'crazi',
 'purchas',
 'lol',
 'work',
 'hard',
 'play',
 'ipad',
 'batteri',
 

In [110]:
len(tokens) # now only 1121

1121

In [113]:
def save_vocabulary(lines, filename):
    data = "\n". join(lines) 
    file = open(filename,'w', encoding="utf-8")
    file.write(data)
    file.close()
    
save_vocabulary(tokens, '/env/static/model/vocabulary.txt')

FileNotFoundError: [Errno 2] No such file or directory: '/env/static/model/vocabulary.txt'