diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7b0c909 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +Makefile +README +blib/ +inc/ +test.pl +private-* \ No newline at end of file diff --git a/.shipit b/.shipit new file mode 100644 index 0000000..2235740 --- /dev/null +++ b/.shipit @@ -0,0 +1,4 @@ +steps = FindVersion, ChangeVersion, CheckChangeLog, DistTest, Commit, Tag, MakeDist, UploadCPAN + +git.tagpattern = %v +git.push_to = origin diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..73e8e2a --- /dev/null +++ b/.travis.yml @@ -0,0 +1,3 @@ +language: "perl" +before_install: + - perl Makefile.PL | cpanm diff --git a/MANIFEST.SKIP b/MANIFEST.SKIP new file mode 100644 index 0000000..aa7d6df --- /dev/null +++ b/MANIFEST.SKIP @@ -0,0 +1,19 @@ +\bRCS\b +\bCVS\b +^MANIFEST\. +^Makefile$ +~$ +\.old$ +^blib/ +^pm_to_blib +^MakeMaker-\d +\.gz$ +\.cvsignore +^9\d_.*\.t +^\.git/ +^\.shipit$ +\.gitignore$ +/\.git/ +^\.gitmodules$ +^\.travis.yml$ +^README\.md$ diff --git a/Makefile.PL b/Makefile.PL new file mode 100644 index 0000000..0a55b12 --- /dev/null +++ b/Makefile.PL @@ -0,0 +1,38 @@ +use strict; +use warnings; + +BEGIN { + my @devmods = qw( + Module::Install::AuthorTests + Module::Install::ReadmeFromPod + Module::Install::Repository + ); + my @not_available; + + eval qq{use inc::Module::Install; 1;} or push @not_available, 'inc::Module::Install'; + for my $mod (@devmods) { + eval qq{require $mod} or push @not_available, $mod; + } + if (@not_available) { + print qq{# The following modules are not available.\n}; + print qq{# `$^X $0 | cpanm` will install them:\n}; + print $_, "\n" for @not_available; + print "\n"; + exit -1; + } +} + +use inc::Module::Install; + +name 'Data-XLSX-Parser'; +all_from 'lib/Data/XLSX/Parser.pm'; + +readme_from 'lib/Data/XLSX/Parser.pm'; +author_tests 'xt'; +auto_set_repository; + +requires 'Archive::Zip'; +requires 'XML::Parser::Expat'; +requires 'File::Temp'; + +WriteAll; diff --git a/lib/Data/XLSX/Parser.pm b/lib/Data/XLSX/Parser.pm new file mode 100644 index 0000000..d52514f --- /dev/null +++ b/lib/Data/XLSX/Parser.pm @@ -0,0 +1,61 @@ +package Data::XLSX::Parser; +use strict; +use warnings; + +use Data::XLSX::Parser::DocumentArchive; +use Data::XLSX::Parser::Workbook; +use Data::XLSX::Parser::SharedStrings; +use Data::XLSX::Parser::Styles; +use Data::XLSX::Parser::Sheet; + +sub new { + my ($class) = @_; + + bless { + _row_event_handler => [], + _archive => undef, + _workbook => undef, + _shared_strings => undef, + }, $class; +} + +sub add_row_event_handler { + my ($self, $handler) = @_; + push @{ $self->{_row_event_handler} }, $handler; +} + +sub open { + my ($self, $file) = @_; + $self->{_archive} = Data::XLSX::Parser::DocumentArchive->new($file); +} + +sub workbook { + my ($self) = @_; + $self->{_workbook} ||= Data::XLSX::Parser::Workbook->new($self->{_archive}); +} + +sub shared_strings { + my ($self) = @_; + $self->{_shared_strings} ||= Data::XLSX::Parser::SharedStrings->new($self->{_archive}); +} + +sub styles { + my ($self) = @_; + $self->{_styles} ||= Data::XLSX::Parser::Styles->new($self->{_archive}); +} + +sub sheet { + my ($self, $sheet_id) = @_; + $self->{_sheet} ||= Data::XLSX::Parser::Sheet->new($self, $self->{_archive}, $sheet_id); +} + +sub _row_event { + my ($self, $row) = @_; + + my $row_vals = [map { $_->{v} } @$row]; + for my $handler (@{ $self->{_row_event_handler} }) { + $handler->($row_vals); + } +} + +1; diff --git a/lib/Data/XLSX/Parser/DocumentArchive.pm b/lib/Data/XLSX/Parser/DocumentArchive.pm new file mode 100644 index 0000000..86a55ef --- /dev/null +++ b/lib/Data/XLSX/Parser/DocumentArchive.pm @@ -0,0 +1,40 @@ +package Data::XLSX::Parser::DocumentArchive; +use strict; +use warnings; + +use Archive::Zip; + +sub new { + my ($class, $filename) = @_; + + my $zip = Archive::Zip->new; + if ($zip->read($filename) != Archive::Zip::AZ_OK) { + die "Cannot open file: $filename"; + } + + bless { + _zip => $zip, + }, $class; +} + +sub workbook { + my ($self) = @_; + $self->{_zip}->memberNamed('xl/workbook.xml'); +} + +sub sheet { + my ($self, $id) = @_; + $self->{_zip}->memberNamed(sprintf 'xl/worksheets/sheet%s.xml', $id); +} + +sub shared_strings { + my ($self) = @_; + $self->{_zip}->memberNamed('xl/sharedStrings.xml'); +} + +sub styles { + my ($self) = @_; + $self->{_zip}->memberNamed('xl/styles.xml'); +} + +1; diff --git a/lib/Data/XLSX/Parser/SharedStrings.pm b/lib/Data/XLSX/Parser/SharedStrings.pm new file mode 100644 index 0000000..28d3c4f --- /dev/null +++ b/lib/Data/XLSX/Parser/SharedStrings.pm @@ -0,0 +1,66 @@ +package Data::XLSX::Parser::SharedStrings; +use strict; +use warnings; + +use XML::Parser::Expat; +use Archive::Zip (); +use File::Temp; + +sub new { + my ($class, $archive) = @_; + + my $self = bless { + _data => [], + + _is_string => 0, + _buf => '', + }, $class; + + my $fh = File::Temp->new( SUFFIX => '.xml' ); + + my $handle = $archive->shared_strings or return $self; + die 'Failed to write temporally file: ', $fh->filename + unless $handle->extractToFileNamed($fh->filename) == Archive::Zip::AZ_OK; + + my $parser = XML::Parser::Expat->new; + $parser->setHandlers( + Start => sub { $self->_start(@_) }, + End => sub { $self->_end(@_) }, + Char => sub { $self->_char(@_) }, + ); + $parser->parse($fh); + + $self; +} + +sub count { + my ($self) = @_; + scalar @{ $self->{_data} }; +} + +sub get { + my ($self, $index) = @_; + $self->{_data}->[$index]; +} + +sub _start { + my ($self, $parser, $name, %attrs) = @_; + $self->{_is_string} = 1 if $name eq 'si'; +} + +sub _end { + my ($self, $parser, $name) = @_; + $self->{_is_string} = 0; + + if ($name eq 'si') { + push @{ $self->{_data} }, $self->{_buf}; + $self->{_buf} = ''; + } +} + +sub _char { + my ($self, $parser, $data) = @_; + $self->{_buf} .= $data if $self->{_is_string}; +} + +1; diff --git a/lib/Data/XLSX/Parser/Sheet.pm b/lib/Data/XLSX/Parser/Sheet.pm new file mode 100644 index 0000000..ad8dd97 --- /dev/null +++ b/lib/Data/XLSX/Parser/Sheet.pm @@ -0,0 +1,158 @@ +package Data::XLSX::Parser::Sheet; +use strict; +use warnings; + +use File::Temp; +use XML::Parser::Expat; +use Archive::Zip (); + +use constant { + STYLE_IDX => 'i', + STYLE => 's', + FMT => 'f', + REF => 'r', + COLUMN => 'c', + VALUE => 'v', + TYPE => 't', + TYPE_SHARED_STRING => 's', + GENERATED_CELL => 'g', +}; + +sub new { + my ($class, $doc, $archive, $sheet_id) = @_; + + my $self = bless { + _document => $doc, + + _data => '', + _is_sheetdata => 0, + _row_count => 0, + _current_row => [], + _cell => undef, + _is_value => 0, + + _shared_strings => $doc->shared_strings, + _styles => $doc->styles, + + }, $class; + + my $fh = File::Temp->new( SUFFIX => '.xml' ); + + my $handle = $archive->sheet($sheet_id); + die 'Failed to write temporally file: ', $fh->filename + unless $handle->extractToFileNamed($fh->filename) == Archive::Zip::AZ_OK; + + my $parser = XML::Parser::Expat->new; + $parser->setHandlers( + Start => sub { $self->_start(@_) }, + End => sub { $self->_end(@_) }, + Char => sub { $self->_char(@_) }, + ); + $parser->parse($fh); + + $self; +} + +sub _start { + my ($self, $parser, $name, %attrs) = @_; + + if ($name eq 'sheetData') { + $self->{_is_sheetdata} = 1; + } + elsif ($self->{_is_sheetdata} and $name eq 'row') { + $self->{_current_row} = []; + } + elsif ($name eq 'c') { + $self->{_cell} = { + STYLE_IDX() => $attrs{ STYLE_IDX() }, + TYPE() => $attrs{ TYPE() }, + REF() => $attrs{ REF() }, + COLUMN() => scalar(@{ $self->{_current_row} }) + 1, + }; + } + elsif ($name eq 'v') { + $self->{_is_value} = 1; + } +} + +sub _end { + my ($self, $parser, $name) = @_; + + if ($name eq 'sheetData') { + $self->{_is_sheetdata} = 0; + } + elsif ($self->{_is_sheetdata} and $name eq 'row') { + $self->{_row_count}++; + $self->{_document}->_row_event( delete $self->{_current_row} ); + } + elsif ($name eq 'c') { + my $c = $self->{_cell}; + $self->_parse_rel($c); + + if (($c->{ TYPE() } || '') eq TYPE_SHARED_STRING()) { + my $idx = int($self->{_data}); + $c->{ VALUE() } = $self->{_shared_strings}->get($idx); + } + else { + $c->{ VALUE() } = $self->{_data}; + } + + $c->{ STYLE() } = $self->{_styles}->cell_style( $c->{ STYLE_IDX() } ); + $c->{ FMT() } = my $cell_type = + $self->{_styles}->cell_type_from_style($c->{ STYLE() }); + + my $v = $c->{ VALUE() }; + if (defined $v and $c->{ FMT() } =~ /^datetime\.(date)?(time)?$/) { + # datetime + warn 'datetime'; + } + else { + if (!defined $v) { + $c->{ VALUE() } = ''; + } + elsif ($cell_type ne 'unicode') { + warn 'not unicode'; + $c->{ VALUE() } = $v; + } + } + + push @{ $self->{_current_row} }, $c; + + $self->{_data} = ''; + $self->{_cell} = undef; + } + elsif ($name eq 'v') { + $self->{_is_value} = 0; + } +} + +sub _char { + my ($self, $parser, $data) = @_; + + if ($self->{_is_value}) { + $self->{_data} .= $data; + } +} + +sub _parse_rel { + my ($self, $cell) = @_; + + my ($column, $row) = $cell->{ REF() } =~ /([A-Z]+)(\d+)/; + + my $v = 0; + my $i = 0; + for my $ch (split '', $column) { + my $s = length($column) - $i++ - 1; + $v += (ord($ch) - ord('A') + 1) * (26**$s); + } + + $cell->{ REF() } = [$v, $row]; + + if ($cell->{ COLUMN() } > $v) { + die sprintf 'Detected smaller index than current cell, something is wrong! (row %s): %s <> %s', $row, $v, $cell->{ COLUMN() }; + } +} + +1; + + diff --git a/lib/Data/XLSX/Parser/Styles.pm b/lib/Data/XLSX/Parser/Styles.pm new file mode 100644 index 0000000..8bf872c --- /dev/null +++ b/lib/Data/XLSX/Parser/Styles.pm @@ -0,0 +1,139 @@ +package Data::XLSX::Parser::Styles; +use strict; +use warnings; + +use XML::Parser::Expat; +use Archive::Zip (); +use File::Temp; + +use constant BUILTIN_FMT => 0; +use constant BUILTIN_TYPE => 1; + +use constant BUILTIN_NUM_FMTS => [ + ['@', 'unicode'], # 0x00 + ['0', 'int'], # 0x01 + ['0.00', 'float'], # 0x02 + ['#,##0', 'float'], # 0x03 + ['#,##0.00', 'float'], # 0x04 + ['($#,##0_);($#,##0)', 'float'], # 0x05 + ['($#,##0_);[RED]($#,##0)', 'float'], # 0x06 + ['($#,##0.00_);($#,##0.00_)', 'float'], # 0x07 + ['($#,##0.00_);[RED]($#,##0.00_)', 'float'], # 0x08 + ['0%', 'int'], # 0x09 + ['0.00%', 'float'], # 0x0a + ['0.00E+00', 'float'], # 0x0b + ['# ?/?', 'float'], # 0x0c + ['# ??/??', 'float'], # 0x0d + ['m-d-yy', 'datetime.date'], # 0x0e + ['d-mmm-yy', 'datetime.date'], # 0x0f + ['d-mmm', 'datetime.date'], # 0x10 + ['mmm-yy', 'datetime.date'], # 0x11 + ['h:mm AM/PM', 'datetime.time'], # 0x12 + ['h:mm:ss AM/PM', 'datetime.time'], # 0x13 + ['h:mm', 'datetime.time'], # 0x14 + ['h:mm:ss', 'datetime.time'], # 0x15 + ['m-d-yy h:mm', 'datetime.datetime'], # 0x16 + #0x17-0x24 -- Differs in Natinal + undef, # 0x17 + undef, # 0x18 + undef, # 0x19 + undef, # 0x1a + undef, # 0x1b + undef, # 0x1c + undef, # 0x1d + undef, # 0x1e + undef, # 0x1f + undef, # 0x20 + undef, # 0x21 + undef, # 0x22 + undef, # 0x23 + undef, # 0x24 + ['(#,##0_);(#,##0)', 'int'], # 0x25 + ['(#,##0_);[RED](#,##0)', 'int'], # 0x26 + ['(#,##0.00);(#,##0.00)', 'float'], # 0x27 + ['(#,##0.00);[RED](#,##0.00)', 'float'], # 0x28 + ['_(*#,##0_);_(*(#,##0);_(*"-"_);_(@_)', 'float'], # 0x29 + ['_($*#,##0_);_($*(#,##0);_(*"-"_);_(@_)', 'float'], # 0x2a + ['_(*#,##0.00_);_(*(#,##0.00);_(*"-"??_);_(@_)', 'float'], # 0x2b + ['_($*#,##0.00_);_($*(#,##0.00);_(*"-"??_);_(@_)', 'float'], # 0x2c + ['mm:ss', 'datetime.timedelta'], # 0x2d + ['[h]:mm:ss', 'datetime.timedelta'], # 0x2e + ['mm:ss.0', 'datetime.timedelta'], # 0x2f + ['##0.0E+0', 'float'], # 0x30 + ['@', 'unicode'], # 0x31 +]; + +sub new { + my ($class, $archive) = @_; + + my $self = bless { + _number_formats => [], + + _is_cell_xfs => 0, + _current_style => undef, + }, $class; + + my $fh = File::Temp->new( SUFFIX => '.xml' ); + + my $handle = $archive->styles; + die 'Failed to write temporally file: ', $fh->filename + unless $handle->extractToFileNamed($fh->filename) == Archive::Zip::AZ_OK; + + my $parser = XML::Parser::Expat->new; + $parser->setHandlers( + Start => sub { $self->_start(@_) }, + End => sub { $self->_end(@_) }, + Char => sub { }, + ); + $parser->parse($fh); + + $self; +} + +sub cell_style { + my ($self, $style_id) = @_; + $style_id ||= 0; + $self->{_number_formats}[int $style_id]; +} + +sub cell_type_from_style { + my ($self, $style) = @_; + BUILTIN_NUM_FMTS->[ $style->{numFmt} ][BUILTIN_TYPE]; +} + +sub cell_format_from_style { + my ($self, $style) = @_; + BUILTIN_NUM_FMTS->[ $style->{numFmt} ][BUILTIN_FMT]; +} + +sub _start { + my ($self, $parser, $name, %attrs) = @_; + + if ($name eq 'cellXfs') { + $self->{_is_cell_xfs} = 1; + } + elsif ($self->{_is_cell_xfs} and $name eq 'xf') { + $self->{_current_style} = { + numFmt => int($attrs{numFmtId}) || 0, + exists $attrs{fontId} ? ( font => $attrs{fontId} ) : (), + exists $attrs{fillId} ? ( fill => $attrs{fillId} ) : (), + exists $attrs{borderId} ? ( border => $attrs{borderId} ) : (), + exists $attrs{xfId} ? ( xf => $attrs{xfId} ) : (), + exists $attrs{applyFont} ? ( applyFont => $attrs{applyFont} ) : (), + exists $attrs{applyNumberFormat} ? ( applyNumFmt => $attrs{applyNumberFormat} ) : (), + }; + } +} + +sub _end { + my ($self, $parser, $name) = @_; + + if ($name eq 'cellXfs') { + $self->{_is_cell_xfs} = 0; + } + elsif ($self->{_current_style} and $name eq 'xf') { + push @{ $self->{_number_formats } }, delete $self->{_current_style}; + } +} + +1; diff --git a/lib/Data/XLSX/Parser/Workbook.pm b/lib/Data/XLSX/Parser/Workbook.pm new file mode 100644 index 0000000..f6bf326 --- /dev/null +++ b/lib/Data/XLSX/Parser/Workbook.pm @@ -0,0 +1,56 @@ +package Data::XLSX::Parser::Workbook; +use strict; +use warnings; + +use XML::Parser::Expat; +use Archive::Zip (); +use File::Temp; + +sub new { + my ($class, $archive) = @_; + + my $self = bless [], $class; + + my $fh = File::Temp->new( SUFFIX => '.xml' ); + + my $handle = $archive->workbook; + die 'Failed to write temporally file: ', $fh->filename + unless $handle->extractToFileNamed($fh->filename) == Archive::Zip::AZ_OK; + + my $parser = XML::Parser::Expat->new; + $parser->setHandlers( + Start => sub { $self->_start(@_) }, + End => sub {}, + Char => sub {}, + ); + $parser->parse($fh); + + $self; +} + +sub names { + my ($self) = @_; + map { $_->{name} } @$self; +} + +sub sheet_id { + my ($self, $name) = @_; + + my ($meta) = grep { $_->{name} eq $name } @$self + or return; + + if ($meta->{'r:id'}) { + (my $r = $meta->{'r:id'}) =~ s/^rId//; + return $r; + } + else { + return $meta->{sheetId}; + } +} + +sub _start { + my ($self, $parser, $el, %attr) = @_; + push @$self, \%attr if $el eq 'sheet'; +} + +1; diff --git a/t/1_____loreyna126.t b/t/1_____loreyna126.t new file mode 100644 index 0000000..5bf360f --- /dev/null +++ b/t/1_____loreyna126.t @@ -0,0 +1,33 @@ +use strict; +use warnings; +use utf8; + +use FindBin; +use Test::More; + +use Data::XLSX::Parser; + +my $parser = Data::XLSX::Parser->new; + +my $fn = __FILE__; +$fn =~ s{t$}{xlsx}; + +$parser->open($fn); + +my @sheets = $parser->workbook->names; +is scalar @sheets, 3, '3 worksheets ok'; + +is $sheets[0], 'POST_DSENDS', 'sheet1 name ok'; + +my $cells = []; +$parser->add_row_event_handler(sub { + my ($row) = @_; + push @$cells, $row; +}); + +$parser->sheet(1); + +is $cells->[112][0], 'RCS Thrust Vector Uncertainties ', 'val ok'; + +done_testing; + diff --git a/t/1_____loreyna126.xlsx b/t/1_____loreyna126.xlsx new file mode 100755 index 0000000..5619fb4 Binary files /dev/null and b/t/1_____loreyna126.xlsx differ diff --git a/t/2_____with_chart.t b/t/2_____with_chart.t new file mode 100644 index 0000000..4a2bc31 --- /dev/null +++ b/t/2_____with_chart.t @@ -0,0 +1,36 @@ +use strict; +use warnings; +use utf8; + +use FindBin; +use Test::More; + +use Data::XLSX::Parser; + +my $parser = Data::XLSX::Parser->new; + +my $fn = __FILE__; +$fn =~ s{t$}{xlsx}; + +$parser->open($fn); + +my @sheets = $parser->workbook->names; +is scalar @sheets, 4, '4 worksheets ok'; + +is $sheets[0], 'Tabelle1', 'sheet1 name ok'; + +my $cells = []; +$parser->add_row_event_handler(sub { + my ($row) = @_; + push @$cells, $row; +}); + +$parser->sheet($parser->workbook->sheet_id($sheets[0])); + +is $cells->[0][0], 1, 'val 0,0 ok'; +is $cells->[0][1], 10, 'val 0,1 ok'; +is $cells->[1][0], 2, 'val 1,0 ok'; +is $cells->[1][1], 20, 'val 1,1 ok'; + +done_testing; + diff --git a/t/2_____with_chart.xlsx b/t/2_____with_chart.xlsx new file mode 100755 index 0000000..f550b0b Binary files /dev/null and b/t/2_____with_chart.xlsx differ diff --git a/xt/sharedstrings.t b/xt/sharedstrings.t new file mode 100644 index 0000000..32f2260 --- /dev/null +++ b/xt/sharedstrings.t @@ -0,0 +1,23 @@ +use strict; +use warnings; +use utf8; +use FindBin; + +use Test::More; + +use_ok 'Data::XLSX::Parser'; + +my $parser = Data::XLSX::Parser->new; +isa_ok $parser, 'Data::XLSX::Parser'; + +$parser->open("$FindBin::Bin/../private-data-20120717.xlsx"); + +my $shared_strings = $parser->shared_strings; + +is $shared_strings->count, 8161, 'count ok'; + +is $shared_strings->get(1), '問題文', 'get 1 ok'; +is $shared_strings->get(1000), 'リオデジャネイロ', 'get 1000 ok'; + + +done_testing; diff --git a/xt/sheet.t b/xt/sheet.t new file mode 100644 index 0000000..d368013 --- /dev/null +++ b/xt/sheet.t @@ -0,0 +1,24 @@ +use strict; +use warnings; +use utf8; +use FindBin; + +use Test::More; + +use_ok 'Data::XLSX::Parser'; + +my $parser = Data::XLSX::Parser->new; +isa_ok $parser, 'Data::XLSX::Parser'; + +$parser->open("$FindBin::Bin/../private-data-20120717.xlsx"); + +my $workbook = $parser->workbook; +isa_ok $workbook, 'Data::XLSX::Parser::Workbook'; + +my @names = $workbook->names; +is scalar @names, 2, '2 workbook ok'; + +my $sheet_id = $workbook->sheet_id($names[0]); +my $sheet = $parser->sheet($sheet_id); + +done_testing; diff --git a/xt/styles.t b/xt/styles.t new file mode 100644 index 0000000..78ed0ef --- /dev/null +++ b/xt/styles.t @@ -0,0 +1,19 @@ +use strict; +use warnings; +use utf8; +use FindBin; + +use Test::More; + +use_ok 'Data::XLSX::Parser'; + +my $parser = Data::XLSX::Parser->new; +isa_ok $parser, 'Data::XLSX::Parser'; + +$parser->open("$FindBin::Bin/../private-data-20120717.xlsx"); + +my $styles = $parser->styles; + + + +done_testing; diff --git a/xt/workbook.t b/xt/workbook.t new file mode 100644 index 0000000..8aba7c9 --- /dev/null +++ b/xt/workbook.t @@ -0,0 +1,22 @@ +use strict; +use warnings; +use Test::More; +use FindBin; + +use_ok 'Data::XLSX::Parser'; + +my $parser = Data::XLSX::Parser->new; +isa_ok $parser, 'Data::XLSX::Parser'; + +$parser->open("$FindBin::Bin/../private-data-20120717.xlsx"); + +my $workbook = $parser->workbook; +isa_ok $workbook, 'Data::XLSX::Parser::Workbook'; + +my @names = $workbook->names; +is scalar @names, 2, '2 workbook ok'; + +is $workbook->sheet_id($names[0]), 1, 'sheet_id 1 ok'; +is $workbook->sheet_id($names[1]), 2, 'sheet_id 2 ok'; + +done_testing;